From 5466f95443b4bd0c2918b34f2ffb23b38b18c0a6 Mon Sep 17 00:00:00 2001 From: "Michael R. Crusoe" Date: Thu, 15 Dec 2022 17:33:44 +0100 Subject: [PATCH] start of a more advanced test --- MANIFEST.in | 3 + cwltool/provenance_profile.py | 14 +- tests/test_provenance.py | 21 ++ tests/wf/adv_prov/data/pdb_query.json | 0 .../data/prepared_biolip_win_p_testing.csv | 0 .../data/prepared_biolip_win_p_training.csv | 0 .../data/sabdab_summary_all_20220527.tsv | 0 tests/wf/adv_prov/model_example_params.json | 0 tests/wf/adv_prov/niaa_wf.cwl | 186 ++++++++++++++++++ tests/wf/adv_prov/niaa_wf_job.yml | 78 ++++++++ tests/wf/adv_prov/tools/combine_features.cwl | 88 +++++++++ tests/wf/adv_prov/tools/combine_inputs.py | 0 tests/wf/adv_prov/tools/combine_labels.cwl | 63 ++++++ tests/wf/adv_prov/tools/combine_labels.py | 0 tests/wf/adv_prov/tools/decompress.cwl | 32 +++ tests/wf/adv_prov/tools/dssp.cwl | 102 ++++++++++ tests/wf/adv_prov/tools/dssp_RASA.py | 0 tests/wf/adv_prov/tools/emulated_model.py | 0 .../tools/epitope_annotation_pipeline.py | 0 .../wf/adv_prov/tools/epitope_annotations.cwl | 100 ++++++++++ tests/wf/adv_prov/tools/get_pc7_inputs.py | 0 tests/wf/adv_prov/tools/get_psp19_inputs.py | 0 .../wf/adv_prov/tools/hhm_inputs_scatter.cwl | 70 +++++++ tests/wf/adv_prov/tools/pc7_inputs.cwl | 63 ++++++ .../wf/adv_prov/tools/pdb_batch_download.cwl | 88 +++++++++ tests/wf/adv_prov/tools/pdb_batch_download.sh | 0 tests/wf/adv_prov/tools/pdb_query.cwl | 76 +++++++ tests/wf/adv_prov/tools/pdb_query.py | 0 tests/wf/adv_prov/tools/ppi_annotations.cwl | 77 ++++++++ tests/wf/adv_prov/tools/ppi_annotations.py | 0 tests/wf/adv_prov/tools/process_sabdab.cwl | 67 +++++++ .../adv_prov/tools/process_sabdab_summary.py | 0 tests/wf/adv_prov/tools/psp19_inputs.cwl | 54 +++++ .../wf/adv_prov/tools/train_epitope_model.cwl | 68 +++++++ 34 files changed, 1246 insertions(+), 4 deletions(-) create mode 100644 tests/wf/adv_prov/data/pdb_query.json create mode 100644 
tests/wf/adv_prov/data/prepared_biolip_win_p_testing.csv create mode 100644 tests/wf/adv_prov/data/prepared_biolip_win_p_training.csv create mode 100644 tests/wf/adv_prov/data/sabdab_summary_all_20220527.tsv create mode 100644 tests/wf/adv_prov/model_example_params.json create mode 100644 tests/wf/adv_prov/niaa_wf.cwl create mode 100644 tests/wf/adv_prov/niaa_wf_job.yml create mode 100644 tests/wf/adv_prov/tools/combine_features.cwl create mode 100644 tests/wf/adv_prov/tools/combine_inputs.py create mode 100644 tests/wf/adv_prov/tools/combine_labels.cwl create mode 100644 tests/wf/adv_prov/tools/combine_labels.py create mode 100644 tests/wf/adv_prov/tools/decompress.cwl create mode 100644 tests/wf/adv_prov/tools/dssp.cwl create mode 100644 tests/wf/adv_prov/tools/dssp_RASA.py create mode 100644 tests/wf/adv_prov/tools/emulated_model.py create mode 100644 tests/wf/adv_prov/tools/epitope_annotation_pipeline.py create mode 100644 tests/wf/adv_prov/tools/epitope_annotations.cwl create mode 100644 tests/wf/adv_prov/tools/get_pc7_inputs.py create mode 100644 tests/wf/adv_prov/tools/get_psp19_inputs.py create mode 100644 tests/wf/adv_prov/tools/hhm_inputs_scatter.cwl create mode 100644 tests/wf/adv_prov/tools/pc7_inputs.cwl create mode 100644 tests/wf/adv_prov/tools/pdb_batch_download.cwl create mode 100644 tests/wf/adv_prov/tools/pdb_batch_download.sh create mode 100644 tests/wf/adv_prov/tools/pdb_query.cwl create mode 100644 tests/wf/adv_prov/tools/pdb_query.py create mode 100644 tests/wf/adv_prov/tools/ppi_annotations.cwl create mode 100644 tests/wf/adv_prov/tools/ppi_annotations.py create mode 100644 tests/wf/adv_prov/tools/process_sabdab.cwl create mode 100644 tests/wf/adv_prov/tools/process_sabdab_summary.py create mode 100644 tests/wf/adv_prov/tools/psp19_inputs.cwl create mode 100644 tests/wf/adv_prov/tools/train_epitope_model.cwl diff --git a/MANIFEST.in b/MANIFEST.in index f314e9ae2..0939a4cc2 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -9,6 +9,9 @@ include 
tests/loop/* include tests/tmp1/tmp2/tmp3/.gitkeep include tests/tmp4/alpha/* include tests/wf/* +include tests/wf/adv_prov/* +include tests/wf/adv_prov/data/* +include tests/wf/adv_prov/tools/* include tests/wf/operation/* include tests/override/* include tests/reloc/*.cwl diff --git a/cwltool/provenance_profile.py b/cwltool/provenance_profile.py index 0c1445cd2..158144a64 100644 --- a/cwltool/provenance_profile.py +++ b/cwltool/provenance_profile.py @@ -404,10 +404,16 @@ def declare_file(self, value: CWLObjectType) -> Tuple[ProvEntity, ProvEntity, st # Transfer SCHEMA annotations to provenance for s in schema_annotations: if "additionalType" in s: - additional_type = cast(str, schema_annotations[s]).split(sep="/")[ - -1 - ] # find better method? - file_entity.add_attributes({PROV_TYPE: SCHEMA[additional_type]}) + atype = schema_annotations[s] + if isinstance(atype, str): + additional_type = atype.split(sep="/")[-1] # find better method? + file_entity.add_attributes({PROV_TYPE: SCHEMA[additional_type]}) + else: + for a_entry in cast(List[str], atype): + additional_type = a_entry.split(sep="/")[ + -1 + ] # find better method? 
+ file_entity.add_attributes({PROV_TYPE: SCHEMA[additional_type]}) else: file_entity = self._add_nested_annotations( s, schema_annotations[s], file_entity diff --git a/tests/test_provenance.py b/tests/test_provenance.py index cfb80ccb8..a801d2eeb 100644 --- a/tests/test_provenance.py +++ b/tests/test_provenance.py @@ -103,6 +103,27 @@ def test_revsort_label_annotations(tmp_path: Path) -> None: ) +def test_advanced_prov_annotations(tmp_path: Path) -> None: + """Pass through of advanced input annotations.""" + base_path = cwltool( + tmp_path, + get_data("tests/wf/adv_prov/niaa_wf.cwl"), + get_data("tests/wf/adv_prov/niaa_wf_job.yml"), + ) + prov_file = base_path / "metadata" / "provenance" / "primary.cwlprov.nt" + arcp_root = find_arcp(base_path) + g = Graph() + with open(prov_file, "rb") as f: + g.parse(file=f, format="nt", publicID=arcp_root) + mime_having_objects = list(g.subjects(SCHEMA.encodingFormat)) + assert len(mime_having_objects) == 8 + # for obj in mime_having_objects: + # assert ( + # cast(Literal, list(g.objects(obj, SCHEMA.encodingFormat))[0]).value + # == "https://www.iana.org/assignments/media-types/text/plain" + # ) + + @needs_docker def test_nested_workflow(tmp_path: Path) -> None: check_provenance(cwltool(tmp_path, get_data("tests/wf/nested.cwl")), nested=True) diff --git a/tests/wf/adv_prov/data/pdb_query.json b/tests/wf/adv_prov/data/pdb_query.json new file mode 100644 index 000000000..e69de29bb diff --git a/tests/wf/adv_prov/data/prepared_biolip_win_p_testing.csv b/tests/wf/adv_prov/data/prepared_biolip_win_p_testing.csv new file mode 100644 index 000000000..e69de29bb diff --git a/tests/wf/adv_prov/data/prepared_biolip_win_p_training.csv b/tests/wf/adv_prov/data/prepared_biolip_win_p_training.csv new file mode 100644 index 000000000..e69de29bb diff --git a/tests/wf/adv_prov/data/sabdab_summary_all_20220527.tsv b/tests/wf/adv_prov/data/sabdab_summary_all_20220527.tsv new file mode 100644 index 000000000..e69de29bb diff --git 
a/tests/wf/adv_prov/model_example_params.json b/tests/wf/adv_prov/model_example_params.json new file mode 100644 index 000000000..e69de29bb diff --git a/tests/wf/adv_prov/niaa_wf.cwl b/tests/wf/adv_prov/niaa_wf.cwl new file mode 100644 index 000000000..fc45dd88d --- /dev/null +++ b/tests/wf/adv_prov/niaa_wf.cwl @@ -0,0 +1,186 @@ +#!/usr/bin/env cwl-runner + +cwlVersion: v1.2 +class: Workflow + +intent: [ edam:operation_2423 ] # Prediction ope +doc: "This mock workflow calculates input features and labels which are used to train a deep learning model for epitope prediction." + +requirements: + ScatterFeatureRequirement: {} + StepInputExpressionRequirement: {} + SubworkflowFeatureRequirement: {} + +inputs: + sabdab_summary: + type: File + format: iana:text/tab-separated-values + doc: "SAbDAb Summary metadata about all structures in the database." + biodl_train_dataset: + type: File + format: iana:text/csv + doc: "BioDL training dataset containing PPI interactions" + biodl_test_dataset: + type: File + doc: "BioDL test dataset with PPI interactions." + hhblits_db: + type: Directory + doc: "Reference database for HHblits" + hhblits_db_name: + type: string + doc: "Name of hhblits reference database" + pdb_search_api_query: + type: File + format: iana:application/json + doc: "Structured query for PDB API." + +outputs: + model_output: + type: File + outputSource: train_epitope_prediction_model/train_log + doc: "Output of the prediction model." + +steps: + run_pdb_query: + in: + pdb_search_query: pdb_search_api_query + out: + [ processed_response ] + run: ./tools/pdb_query.cwl + doc: | + Use PDB search API to run a query on the Protein Data Bank. Returns .txt file with comma-separated PDB IDs which satisfy the query requirements. + See https://search.rcsb.org/index.html#search-api for a tutorial. 
+ + download_pdb_files: + in: + input_file: run_pdb_query/processed_response + mmcif_format: { default: True } + pdb_format: { default: True } + out: + [ pdb_files ] + run: ./tools/pdb_batch_download.cwl + + decompress_pdb_files: + in: + pdb_archives: download_pdb_files/pdb_files + out: [ cifs, pdbs ] + run: ./tools/decompress.cwl + doc: "Decompress files using gzip" + + generate_dssp_labels: + in: + pdb_files: decompress_pdb_files/pdbs # change this later + rsa_cutoff: { default : 0.06 } + out: [ dssp_output_files ] + run: ./tools/dssp.cwl + doc: "Use DSSP to extract secondary structure and solvent accessibility from PDB files." + + generate_ppi_labels: + in: + mmcif_files: decompress_pdb_files/cifs + train_dataset: biodl_train_dataset + test_dataset: biodl_test_dataset + out: [ ppi_fasta_files ] + run: ./tools/ppi_annotations.cwl + doc: "Extract ppi annotations from BioDL. This step is partly emulated." + + preprocess_sabdab_data: + doc: "Extract antigen chains from SAbDab summary file." + in: + sabdab_summary: sabdab_summary + out: [ processed_summary ] + run: ./tools/process_sabdab.cwl + + generate_epitope_labels: + in: + mmcif_files: decompress_pdb_files/cifs + sabdab_processed: preprocess_sabdab_data/processed_summary + out: [ epitope_fasta_dir ] + run: ./tools/epitope_annotations.cwl + doc: "Extract epitope annotations from PDB files." + + combine_labels: + doc: "Combine labels into 1 file per protein sequence." + run: ./tools/combine_labels.cwl + in: + epitope_directory: generate_epitope_labels/epitope_fasta_dir + ppi_directory: generate_ppi_labels/ppi_fasta_files + dssp_directory: generate_dssp_labels/dssp_output_files + out: [ labels_combined ] + + generate_pc7: + doc: Calculate PC7 features for each residue in each protein sequence. 
+ run: ./tools/pc7_inputs.cwl # to do: adapt tool so it takes directory of fasta files as input + in: + fasta: generate_ppi_labels/ppi_fasta_files + out: [ pc7_features ] + + generate_psp19: + label: Calculate PSP19 features for each residue in each protein sequence. + run: ./tools/psp19_inputs.cwl + in: + fasta: generate_ppi_labels/ppi_fasta_files + out: [ psp19_features ] + + generate_hhm: + in: + query_sequences: + source: generate_ppi_labels/ppi_fasta_files # type Directory + valueFrom: $(self.listing) # here type Directory is converted to File array + hhblits_db: hhblits_db + hhblits_db_name: hhblits_db_name + hhblits_n_iterations: { default: 1 } + out: [ hhm_file_array ] + run: + class: Workflow # this is a subworkflow as a workaround because generate_ppi_labels/ppi_fasta_files is Directory while run_hhblits takes File + inputs: + query_sequences: File[] + hhblits_db: Directory + hhblits_db_name: string + hhblits_n_iterations: int + outputs: + hhm_file_array: + type: File[] + outputSource: run_hhblits/hhm + steps: + run_hhblits: + in: + protein_query_sequence: query_sequences + database: hhblits_db + database_name: hhblits_db_name + n_iterations: hhblits_n_iterations + out: [ hhm ] + scatter: protein_query_sequence + run: ./tools/hhm_inputs_scatter.cwl + combine_features: + in: + input_sequences: generate_ppi_labels/ppi_fasta_files + pc7_features: generate_pc7/pc7_features + psp19_features: generate_psp19/psp19_features + hhm_features: generate_hhm/hhm_file_array # file array, combine_features.cwl converts it to directory + out: [ combined_features ] + run: ./tools/combine_features.cwl + + train_epitope_prediction_model: # This step incorporates both training and prediction, not sure if this is the case in the real workflow. 
+ in: # in the real workflow, the configuration file would be generated as part of the workflow as well + input_features: combine_features/combined_features + input_labels: combine_labels/labels_combined + out: [ train_log ] + run: ./tools/train_epitope_model.cwl + doc: "Predict epitope residues using a multi-task learning approach. This step is not real yet." + +$namespaces: + iana: "https://www.iana.org/assignments/media-types/" + s: "https://schema.org/" + edam: "http://edamontology.org/" + cwlprov: "https://w3id.org/cwl/prov#" + +$schemas: +- https://schema.org/version/latest/schemaorg-current-https.rdf +- https://edamontology.org/EDAM_1.25.owl + +s:author: +- s:name: "Renske de Wit" + s:identifier: https://orcid.org/0000-0003-0902-0086 +- s:name: "Katharina Waury" +s:license: https://spdx.org/licenses/Apache-2.0 diff --git a/tests/wf/adv_prov/niaa_wf_job.yml b/tests/wf/adv_prov/niaa_wf_job.yml new file mode 100644 index 000000000..787274ece --- /dev/null +++ b/tests/wf/adv_prov/niaa_wf_job.yml @@ -0,0 +1,78 @@ +cwlprov:prov: + sabdab_search: + s:additionalType: s:SearchAction + s:query: "All structures" + s:endTime: 2022-05-27 + s:object: + s:name: "Structural Antibody Database" + s:citation: + s:identifier: https://doi.org/10.1093/nar/gkab1050 + s:result: sabdab_summary + s:description: "Search Action for metadata on antibody-antigen complexes in SAbDab" + + +pdb_search_api_query: + class: File + location: ./data/pdb_query.json + format: iana:application/json + s:description: "Input query for PDB search API." + s:additionalType: + - edam:data_3786 # Query script + +sabdab_summary: + class: File + path: ./data/sabdab_summary_all_20220527.tsv + format: iana:text/tab-separated-values + s:description: "Summary file downloaded from SAbDAb database, containing metadata for all structures." 
+ s:additionalType: + - edam:data_2080 # database search results + - s:Dataset + + +biodl_train_dataset: + class: File + path: data/prepared_biolip_win_p_training.csv + #location: https://www.ibi.vu.nl/downloads/PIPENN/PIPENN/BioDL-Datasets/prepared_biolip_win_p_training.csv + format: iana:text/csv + s:description: "BioDL training set containing PPI annotations for protein sequences (UniProt IDs)" + s:name: "BioDL training dataset" + s:citation: + s:identifier: https://doi.org/10.1093/bioinformatics/btac071 + s:additionalType: + - s:Dataset + - edam:data_1277 # protein features + +biodl_test_dataset: + class: File + path: data/prepared_biolip_win_p_testing.csv + #location: https://www.ibi.vu.nl/downloads/PIPENN/PIPENN/BioDL-Datasets/prepared_biolip_win_p_testing.csv + s:description: "BioDL test set containing PPI annotations for protein sequences (UniProt IDs)." + s:name: "BioDL test dataset" + s:citation: + s:identifier: https://doi.org/10.1093/bioinformatics/btac071 + s:additionalType: + - s:Dataset + - edam:data_1277 # protein features + +hhblits_db: + class: Directory + location: ../hhblits/databases + s:citation: + s:identifier: https://doi.org/10.1038/nmeth.1818 + s:name: "pdb70" + s:description: "Directory containing HHBlits reference database." + s:additionalType: + - s:Dataset + - edam:data_0955 # data index + +hhblits_db_name: pdb70 +hhblits_n_iterations: 1 + +s:description: "Demonstration run of epitope prediction workflow. Some steps are emulated, so the results of the workflow are not yet biologically meaningful." 
+ +$namespaces: + iana: "https://www.iana.org/assignments/media-types/" + s: "https://schema.org/" + edam: "http://edamontology.org/" + cwlprov: "https://w3id.org/cwl/prov#" + diff --git a/tests/wf/adv_prov/tools/combine_features.cwl b/tests/wf/adv_prov/tools/combine_features.cwl new file mode 100644 index 000000000..1cf62735d --- /dev/null +++ b/tests/wf/adv_prov/tools/combine_features.cwl @@ -0,0 +1,88 @@ +#!/usr/bin/env cwl-runner + +cwlVersion: v1.2 +class: CommandLineTool +baseCommand: bash # python3 + +label: Combine input features + +doc: | + "Combines the input features for each protein sequence into 1 file per sequence. Output is stored in a new directory." + +hints: + # DockerRequirement: + # dockerPull: amancevice/pandas:1.3.4-slim + SoftwareRequirement: + packages: + numpy: + specs: [ https://anaconda.org/conda-forge/numpy ] + version: [ "1.21.4" ] + pandas: + specs: [ https://anaconda.org/conda-forge/pandas ] + version: [ "1.3.4" ] + +requirements: + InlineJavascriptRequirement: {} + InitialWorkDirRequirement: + listing: | + ${ + return [{"entry": {"class": "Directory", "basename": "hhm_features_dir", "listing": inputs.hhm_features}, "writable": true}] + } + +arguments: + # - $(inputs.script.path) + # - $(inputs.input_sequences.path) + # - "hhm_features_dir" + # - $(inputs.pc7_features.path) + # - $(inputs.psp19_features.path) + # - "--outdir" + # - ./$(inputs.outdir_name) # An output directory will be created in current working directory + - -c + - | + set -ex + mkdir $(inputs.outdir_name) + touch $(inputs.outdir_name)/$(inputs.input_sequences.basename) + + +inputs: + script: + type: File + default: + class: File + location: ./combine_inputs.py + input_sequences: + type: Directory + # default: + # class: Directory + # location: ../data/test_set/ppi_fasta # delete this later + hhm_features: + type: File[] + # default: + # - class: File + # location: ../final_test_run/2HKF_P.hhm + # - class: File + # location: ../final_test_run/4W6W_A.hhm + # - class: 
File + # location: ../final_test_run/4W6X_A.hhm + # - class: File + # location: ../final_test_run/4W6Y_A.hhm + pc7_features: + type: Directory + # default: + # class: Directory + # location: ../final_test_run/pc7_features + psp19_features: + type: Directory + # default: + # class: Directory + # location: ../final_test_run/psp19_features + outdir_name: + type: string + default: "input_features" + +outputs: + combined_features: + type: Directory + outputBinding: + glob: $(inputs.outdir_name) + diff --git a/tests/wf/adv_prov/tools/combine_inputs.py b/tests/wf/adv_prov/tools/combine_inputs.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/wf/adv_prov/tools/combine_labels.cwl b/tests/wf/adv_prov/tools/combine_labels.cwl new file mode 100644 index 000000000..157e9fd7e --- /dev/null +++ b/tests/wf/adv_prov/tools/combine_labels.cwl @@ -0,0 +1,63 @@ +#!/usr/bin/env cwl-runner + +cwlVersion: v1.2 +class: CommandLineTool + +baseCommand: bash # python3 + +# hints: +# DockerRequirement: +# dockerPull: amancevice/pandas:1.3.4-slim +# SoftwareRequirement: +# packages: +# pandas: +# specs: [ https://anaconda.org/conda-forge/pandas ] +# version: [ "1.3.4" ] +# python: +# version: [ "3.9.7" ] + +arguments: + # - $(inputs.script.path) + # - $(inputs.epitope_directory.path) + # - $(inputs.ppi_directory.path) + # - $(inputs.dssp_directory.path) + # - "--outdir" + # - $(inputs.output_directory) + - -c + - | + set -ex + mkdir $(inputs.output_directory) + touch $(inputs.output_directory)/$(inputs.epitope_directory.basename) + touch $(inputs.output_directory)/$(inputs.ppi_directory.basename) + touch $(inputs.output_directory)/$(inputs.dssp_directory.basename) + + +inputs: + script: + type: File + default: + class: File + location: ./combine_labels.py + epitope_directory: + type: Directory + doc: Directory with FASTA files with epitope annotations. + ppi_directory: + type: Directory + doc: Directory with FASTA files with PPI annotations. 
+ dssp_directory: + type: Directory + doc: Directory with DSSP output files. + output_directory: + type: string + default: "./combined_labels" + +outputs: + labels_combined: + type: Directory + doc: "Directory with 1 file per sequence, containing label values for each residue" + outputBinding: + glob: $(inputs.output_directory) + + + + diff --git a/tests/wf/adv_prov/tools/combine_labels.py b/tests/wf/adv_prov/tools/combine_labels.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/wf/adv_prov/tools/decompress.cwl b/tests/wf/adv_prov/tools/decompress.cwl new file mode 100644 index 000000000..8c68ccb6e --- /dev/null +++ b/tests/wf/adv_prov/tools/decompress.cwl @@ -0,0 +1,32 @@ +#!/usr/bin/env cwl-runner + +cwlVersion: v1.2 +class: CommandLineTool + +requirements: + InitialWorkDirRequirement: + listing: $(inputs.pdb_archives) + +baseCommand: bash # gunzip + +arguments: + - -c + - | + set -ex; for file in *.gz; do mv \${file} \${file%%.gz}; done + +inputs: + pdb_archives: + type: File[] + # inputBinding: + # position: 0 + +outputs: + cifs: + type: File[] + outputBinding: + glob: "*.cif" + pdbs: + type: File[] + outputBinding: + glob: "*.pdb" + diff --git a/tests/wf/adv_prov/tools/dssp.cwl b/tests/wf/adv_prov/tools/dssp.cwl new file mode 100644 index 000000000..6279f2444 --- /dev/null +++ b/tests/wf/adv_prov/tools/dssp.cwl @@ -0,0 +1,102 @@ +#!/usr/bin/env cwl-runner + +cwlVersion: v1.2 +class: CommandLineTool +baseCommand: bash # python3 + +doc: "Use DSSP to extract secondary structure and solvent accessibility from PDB files." 
+intent: [ http://edamontology.org/operation_0320 ] + +requirements: + InlineJavascriptRequirement: {} + InitialWorkDirRequirement: # the script takes a directory as input + listing: | + ${ + return [{"entry": {"class": "Directory", "basename": "pdb_source_dir", "listing": inputs.pdb_files}, "writable": true}] + } + +hints: + # DockerRequirement: + # dockerPull: biopython/biopython@sha256:437075df44b0c9b3da96f71040baef0086789de7edf73c81de4ace30a127a245 + SoftwareRequirement: + packages: + pandas: + version: [ "0.19.1" ] + specs: [ https://pypi.org/project/pandas/ ] + biopython: + specs: [ https://pypi.org/project/biopython/ ] + version: [ "1.75" ] + dssp: + specs: [ https://swift.cmbi.umcn.nl/gv/dssp/ ] + version: [ "2.0.4" ] # this version does not support mmCIF files + python: + version: [ "3.5" ] + +arguments: + # - $(inputs.script.path) + # - "pdb_source_dir" + # - "-o" + # - $(inputs.output_dir) + # - "-d" + # - $(inputs.dssp) + # - "-c" + # - $(inputs.rsa_cutoff) + - -c + - | + set -ex + mkdir $(inputs.output_dir) + touch $(inputs.output_dir)/$(inputs.pdb_files[0].nameroot) + + +inputs: + script: + type: File + default: + class: File + location: ./dssp_RASA.py + pdb_files: + type: File[] + doc: "Protein structures in PDB format." + output_dir: + type: string + default: "dssp_output" + dssp: + type: string + default: "dssp" # for newer dssp versions: mkdssp + rsa_cutoff: + type: float + default: 0.06 + doc: "Threshold exposed surface area for considering amino acids buried." 
+ +outputs: + dssp_output_files: + type: Directory + outputBinding: + glob: $(inputs.output_dir) + +s:author: +- class: s:Person + s:name: "Renske de Wit" +s:license: https://spdx.org/licenses/Apache-2.0 +s:dateCreated: "2022-05-28" +s:mainEntity: + class: s:SoftwareApplication + s:license: https://spdx.org/licenses/Apache-2.0 + s:author: + - class: s:Person + s:name: "DS" + s:description: "Script which takes a directory of pdb files as input and calculates relative surface accessibility for each residue in the protein sequence." + s:basedOn: + - class: s:SoftwareApplication + s:name: "DSSP" + +$namespaces: + s: https://schema.org/ + edam: http://edamontology.org/ + +$schemas: +- https://schema.org/version/latest/schemaorg-current-https.rdf +- https://edamontology.org/EDAM_1.25.owl + + + diff --git a/tests/wf/adv_prov/tools/dssp_RASA.py b/tests/wf/adv_prov/tools/dssp_RASA.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/wf/adv_prov/tools/emulated_model.py b/tests/wf/adv_prov/tools/emulated_model.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/wf/adv_prov/tools/epitope_annotation_pipeline.py b/tests/wf/adv_prov/tools/epitope_annotation_pipeline.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/wf/adv_prov/tools/epitope_annotations.cwl b/tests/wf/adv_prov/tools/epitope_annotations.cwl new file mode 100644 index 000000000..7d744194e --- /dev/null +++ b/tests/wf/adv_prov/tools/epitope_annotations.cwl @@ -0,0 +1,100 @@ +#!/usr/bin/env cwl-runner + +cwlVersion: v1.2 +class: CommandLineTool + +baseCommand: bash # python3 + +intent: [ http://edamontology.org/operation_0320 ] + +requirements: + InlineJavascriptRequirement: {} + InitialWorkDirRequirement: # the script takes a directory as input + listing: | + ${ + return [{"entry": {"class": "Directory", "basename": "mmcif_directory", "listing": inputs.mmcif_files}, "writable": true}] + } + +doc: | + Runs Python script which takes directory of mmCIF files as 
input and outputs directory of FASTA files with protein sequence + epitope annotations. + +hints: + # DockerRequirement: + # dockerImageId: pdbecif-pandas:20220620 + # dockerFile: | + # FROM docker.io/debian:stable-slim + # RUN apt-get update && apt-get install -y --no-install-recommends python3-pip + # RUN python3 -m pip install PDBeCif pandas + SoftwareRequirement: + packages: + pandas: + specs: [ https://anaconda.org/conda-forge/pandas ] + version: [ "1.2.4" ] + python: + version: [ "3.9.1" ] + pdbecif: + specs: [ https://pypi.org/project/PDBeCif/ ] + version: [ "1.5" ] + +arguments: + # - $(inputs.script.path) + # - "mmcif_directory" + # - $(inputs.sabdab_processed.path) + # - "--fasta_directory" + # - $(inputs.fasta_output_dir) + # - "--df_directory" + # - $(inputs.df_output_dir) + - -c + - | + mkdir $(inputs.fasta_output_dir) $(inputs.df_output_dir); + touch $(inputs.fasta_output_dir)/$(inputs.mmcif_files[0].basename).fasta + touch $(inputs.df_output_dir)/$(inputs.mmcif_files[0].basename).df + +inputs: + script: + type: File + default: + class: File + location: ./epitope_annotation_pipeline.py + mmcif_files: + type: File[] + doc: mmCIF file array + sabdab_processed: + format: iana:text/csv + type: File + doc: "table of PDB entries with associated H, L and antigen chain." + fasta_output_dir: + type: string + default: "./epitope_fasta" + df_output_dir: + type: string + default: "./epitope_df" + +outputs: + epitope_fasta_dir: + type: Directory + outputBinding: + glob: $(inputs.fasta_output_dir) + epitope_df_dir: + type: Directory + outputBinding: + glob: $(inputs.df_output_dir) + +s:dateCreated: 2022-05-30 + +s:mainEntity: + s:additionalType: s:SoftwareApplication + s:author: + - s:name: "Katharina Waury" + s:dateCreated: 2022-02-10 + s:programmingLanguage: Python + s:description: "Script which extracts epitope annotations and dataframes from mmCIF files." 
+ +$namespaces: + s: https://schema.org/ + edam: http://edamontology.org/ + iana: https://www.iana.org/assignments/media-types/ + +$schemas: +- https://schema.org/version/latest/schemaorg-current-https.rdf +- https://edamontology.org/EDAM_1.25.owl diff --git a/tests/wf/adv_prov/tools/get_pc7_inputs.py b/tests/wf/adv_prov/tools/get_pc7_inputs.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/wf/adv_prov/tools/get_psp19_inputs.py b/tests/wf/adv_prov/tools/get_psp19_inputs.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/wf/adv_prov/tools/hhm_inputs_scatter.cwl b/tests/wf/adv_prov/tools/hhm_inputs_scatter.cwl new file mode 100644 index 000000000..eaba7bf1a --- /dev/null +++ b/tests/wf/adv_prov/tools/hhm_inputs_scatter.cwl @@ -0,0 +1,70 @@ +cwlVersion: v1.2 +class: CommandLineTool +baseCommand: bash # hhblits + +doc: | + CommandLineTool for hhblits, part of HH-suite. See https://github.com/soedinglab/hh-suite for documentation. +hints: + SoftwareRequirement: + packages: + hhsuite: + specs: + - https://anaconda.org/bioconda/hhsuite + - https://bio.tools/hhsuite + version: [ "3.3.0" ] +# DockerRequirement: +# dockerPull: quay.io/biocontainers/hhsuite:3.3.0--py39pl5321h67e14b5_5 # this is the version opus-tass uses? 
+ +inputs: + protein_query_sequence: + type: File + # format: [ + # edam:format_1929, # FASTA + # edam:format_3281, # A2M + # ] + database: Directory # too large to be included in RO, change later to type string = path to database + database_name: string + n_iterations: + type: int + default: 2 # change this to the correct value + + +arguments: + # - "-i" + # - $(inputs.protein_query_sequence.path) #$(inputs.fasta_dir.path)/$(inputs.protein_id).fasta + # - "-d" + # - $(inputs.database.path)/$(inputs.database_name) + # - "-o" + # - $(inputs.protein_query_sequence.nameroot).hhr + # - "-ohhm" + # - $(inputs.protein_query_sequence.nameroot).hhm + # - "-n" + # - $(inputs.n_iterations) + - -c + - | + set +ex + touch $(inputs.protein_query_sequence.nameroot).hhr + touch $(inputs.protein_query_sequence.nameroot).hhm + + +outputs: + hhm: + type: File + outputBinding: + glob: "*.hhm" + + +s:author: # Creator of this CWL document +- s:identifier: https://orcid.org/0000-0003-0902-0086 + +s:license: Apache-2.0 + +$namespaces: + s: https://schema.org/ + edam: http://edamontology.org/ + +$schemas: +- https://schema.org/version/latest/schemaorg-current-https.rdf +- https://edamontology.org/EDAM_1.25.owl + + diff --git a/tests/wf/adv_prov/tools/pc7_inputs.cwl b/tests/wf/adv_prov/tools/pc7_inputs.cwl new file mode 100644 index 000000000..003fef3cf --- /dev/null +++ b/tests/wf/adv_prov/tools/pc7_inputs.cwl @@ -0,0 +1,63 @@ +#!/usr/bin/env cwl-runner + +cwlVersion: v1.0 +class: CommandLineTool +# hints: +# DockerRequirement: +# dockerPull: amancevice/pandas:1.3.4-slim # Script needs numpy which is a dependency of pandas +# SoftwareRequirement: +# packages: +# numpy: +# specs: [ https://anaconda.org/conda-forge/numpy ] + +baseCommand: bash # python3 + +doc: PC7 features are assigned to each residue in each protein sequence. Output is a directory of files (1 per sequence). 
+# intent: [ http://edamontology.org/operation_0361 ] + +inputs: + script: + type: File + default: + class: File + location: ./get_pc7_inputs.py + # inputBinding: { position: 1 } + fasta: + type: Directory + format: edam:format_2200 # fasta-like (text) + # inputBinding: + # position: 2 + outdir: + type: string + # inputBinding: + # position: 3 + # prefix: -o + default: "pc7_features" + +arguments: + - -c + - | + mkdir $(inputs.outdir) + touch $(inputs.outdir)/$(inputs.fasta.nameroot) + +outputs: + pc7_features: + type: Directory + outputBinding: + glob: $(inputs.outdir) + +s:mainEntity: # add that this is a commandlinetool + s:programmingLanguage: Python + s:codeRepository: https://github.com/RenskeW/cwl-epitope/blob/b5e31d42006fd7003716f57963646d47d1154549/tools/get_pc7_inputs.py + s:isBasedOn: + - s:additionalType: s:SoftwareApplication + s:name: OPUS-TASS + s:identifier: https://bio.tools/opus-tass + +$namespaces: + s: https://schema.org/ + edam: http://edamontology.org/ + +$schemas: +- https://schema.org/version/latest/schemaorg-current-https.rdf +- https://edamontology.org/EDAM_1.25.owl diff --git a/tests/wf/adv_prov/tools/pdb_batch_download.cwl b/tests/wf/adv_prov/tools/pdb_batch_download.cwl new file mode 100644 index 000000000..97b3bde3b --- /dev/null +++ b/tests/wf/adv_prov/tools/pdb_batch_download.cwl @@ -0,0 +1,88 @@ +#!/usr/env/bin cwl-runner + +cwlVersion: v1.2 +class: CommandLineTool + +baseCommand: touch # bash + +doc: "Download files from the PDB in a specific format." 
+ +intent: [ http://edamontology.org/operation_2422 ] +requirements: + NetworkAccess: + networkAccess: True + +inputs: + script: + type: File + # inputBinding: + # position: 1 + default: + class: File + location: ./pdb_batch_download.sh + input_file: + doc: "Comma-separated .txt file with pdb entries to download" + type: File + format: iana:text/csv + # inputBinding: + # position: 3 + # prefix: "-f" + mmcif_format: # The last arguments specify the format in which entries will be downloaded + type: boolean + # inputBinding: + # position: 4 + # prefix: "-c" # .cif.gz + default: True + pdb_format: + type: boolean + # inputBinding: + # position: 5 + # prefix: "-p" # .pdb.gz + default: False + pdb1_format: + type: boolean + # inputBinding: + # position: 6 + # prefix: "-a" # .pdb1.gz + default: False + xml_format: + type: boolean + # inputBinding: + # position: 7 + # prefix: "-x" # .xml.gz + default: False + sfcif_format: + type: boolean + # inputBinding: + # position: 8 + # prefix: "-s" # .sf.cif.gz + default: False + mr_format: + type: boolean + # inputBinding: + # position: 9 + # prefix: "-m" # .mr.gz + default: False + mr_str_format: + type: boolean + # inputBinding: + # position: 10 + # prefix: "-r" # .mr.str.gz + default: False + +arguments: + - $(inputs.input_file.nameroot).1.cif.gz + - $(inputs.input_file.nameroot).2.cif.gz + - $(inputs.input_file.nameroot).1.pdb.gz + - $(inputs.input_file.nameroot).2.pdb.gz + +outputs: + pdb_files: + type: File[] + outputBinding: + glob: "*.gz" + doc: "Downloaded files" + + +$namespaces: + iana: https://www.iana.org/assignments/media-types/ diff --git a/tests/wf/adv_prov/tools/pdb_batch_download.sh b/tests/wf/adv_prov/tools/pdb_batch_download.sh new file mode 100644 index 000000000..e69de29bb diff --git a/tests/wf/adv_prov/tools/pdb_query.cwl b/tests/wf/adv_prov/tools/pdb_query.cwl new file mode 100644 index 000000000..62d34c584 --- /dev/null +++ b/tests/wf/adv_prov/tools/pdb_query.cwl @@ -0,0 +1,76 @@ +#!/usr/bin/env cwl-runner 
+ +cwlVersion: v1.2 +class: CommandLineTool + +baseCommand: cat # python3 + +requirements: + NetworkAccess: + networkAccess: True + +intent: [ http://edamontology.org/operation_2421 ] # Database search + +hints: + # DockerRequirement: + # dockerPull: nyurik/alpine-python3-requests@sha256:e0553236e3ebaa240752b41b8475afb454c5ab4c17eb023a2a904637eda16cf6 + SoftwareRequirement: + packages: + python3: + version: [ 3.9.5 ] + requests: + version: [ 2.25.1 ] + +arguments: + # - $(inputs.script.path) + - $(inputs.pdb_search_query.path) + # - "--outpath" + # - $(inputs.return_file) + +stdout: $(inputs.return_file) + +inputs: + script: + type: File + default: + class: File + location: ./pdb_query.py + pdb_search_query: + type: File + label: Query for PDB search API in json format + format: iana:application/json + return_file: + type: string + label: Path to output file + default: "./pdb_ids.txt" + doc: "Comma-separated text file with PDB ids" + +outputs: + processed_response: + type: File + format: iana:text/csv + doc: Comma-separated text file with returned identifiers from PDB search API + outputBinding: + glob: $(inputs.return_file) + +# label: Query PDB search API and store output in comma-separated text file. + +doc: | + This tool invokes a Python script which uses requests library to query PDB search API and return a comma-separated file of identifiers returned by the API. 
+ More information about PDB search API: https://search.rcsb.org/index.html + + +$namespaces: + iana: https://www.iana.org/assignments/media-types/ + s: https://schema.org/ + +$schemas: +- https://schema.org/version/latest/schemaorg-current-https.rdf + +s:author: +- s:identifier: https://orcid.org/0000-0003-0902-0086 + +s:mainEntity: + s:author: + - s:identifier: https://orcid.org/0000-0003-0902-0086 + diff --git a/tests/wf/adv_prov/tools/pdb_query.py b/tests/wf/adv_prov/tools/pdb_query.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/wf/adv_prov/tools/ppi_annotations.cwl b/tests/wf/adv_prov/tools/ppi_annotations.cwl new file mode 100644 index 000000000..2ce630b18 --- /dev/null +++ b/tests/wf/adv_prov/tools/ppi_annotations.cwl @@ -0,0 +1,77 @@ +#!/usr/bin/env cwl-runner + +cwlVersion: v1.2 +class: CommandLineTool + +baseCommand: bash # python3 + +doc: "Extract PPI annotations from BioDL." +intent: [ http://edamontology.org/operation_0320 ] + +hints: + # DockerRequirement: + # dockerImageId: pdbecif-pandas:20220620 + # dockerFile: | + # FROM docker.io/debian:stable-slim + # RUN apt-get update && apt-get install -y --no-install-recommends python3-pip + # RUN python3 -m pip install PDBeCif pandas + SoftwareRequirement: + packages: + pandas: + specs: [ https://anaconda.org/conda-forge/pandas ] + version: [ "1.2.4" ] + python: + version: [ "3.9.1" ] + pdbecif: + specs: [ https://pypi.org/project/PDBeCif/ ] + version: [ "1.5" ] + +requirements: + InlineJavascriptRequirement: {} + InitialWorkDirRequirement: # the script takes a directory as input + listing: + - entry: | + ${ return {"class": "Directory", + "listing": inputs.mmcif_files }; + } + entryname: mmcif_directory + # writable: true + +inputs: + script: + type: File + default: + class: File + location: ./ppi_annotations.py + mmcif_files: # the download leaves us with an array of files, but script takes type Directory --> InitialWorkdirRequirement + type: File[] + train_dataset: + type: File
+ doc: "BioDL training set" + test_dataset: + type: File + doc: "BioDL test set" + output_directory_name: + type: string + default: "ppi_fasta" + +arguments: +# - $(inputs.script.path) +# - "mmcif_directory" +# - $(inputs.train_dataset.path) +# - $(inputs.test_dataset.path) +# - "--outdir" +#- $(inputs.output_directory) +- -c +- | + set -ex + mkdir $(inputs.output_directory_name) + touch $(inputs.output_directory_name)/$(inputs.train_dataset.nameroot) + touch $(inputs.output_directory_name)/$(inputs.test_dataset.nameroot) + + +outputs: + ppi_fasta_files: + type: Directory + outputBinding: + glob: $(inputs.output_directory_name) diff --git a/tests/wf/adv_prov/tools/ppi_annotations.py b/tests/wf/adv_prov/tools/ppi_annotations.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/wf/adv_prov/tools/process_sabdab.cwl b/tests/wf/adv_prov/tools/process_sabdab.cwl new file mode 100644 index 000000000..2f627fa15 --- /dev/null +++ b/tests/wf/adv_prov/tools/process_sabdab.cwl @@ -0,0 +1,67 @@ +#!/usr/bin/env cwl-runner + +cwlVersion: v1.2 +class: CommandLineTool + +doc: "Preprocess SAbDab summary file." +intent: [ http://edamontology.org/operation_2409 ] + +hints: + # DockerRequirement: + # dockerPull: amancevice/pandas:1.3.4-slim + SoftwareRequirement: + packages: + python: + version: [ "3.9.7" ] + pandas: + version: [ "1.3.4" ] + +baseCommand: cat # python3 + +arguments: +# - $(inputs.script.path) + - $(inputs.sabdab_summary.path) +# - "-o" +# - $(inputs.results_name) + +stdout: $(inputs.results_name) + +inputs: + script: + type: File + default: + class: File + location: ./process_sabdab_summary.py + sabdab_summary: + type: File + label: Summary file downloaded from SAbDab. + format: iana:text/tab-separated-values + results_name: + type: string + label: Name of output file in which processed results are stored. 
+ default: "SAbDab_protein_antigens_PDB_chains.csv" + +outputs: + processed_summary: + type: File + format: iana:text/csv + outputBinding: + glob: $(inputs.results_name) + +s:author: +- class: s:Person + s:name: "Renske de Wit" + s:identifier: https://orcid.org/0000-0003-0902-0086 +s:license: https://spdx.org/licenses/Apache-2.0 + +s:mainEntity: + class: s:SoftwareApplication + s:license: https://spdx.org/licenses/Apache-2.0 + s:author: + - class: s:Person + s:name: "Katharina Waury" + s:identifier: + +$namespaces: + iana: "https://www.iana.org/assignments/media-types/" + s: "https://schema.org/" diff --git a/tests/wf/adv_prov/tools/process_sabdab_summary.py b/tests/wf/adv_prov/tools/process_sabdab_summary.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/wf/adv_prov/tools/psp19_inputs.cwl b/tests/wf/adv_prov/tools/psp19_inputs.cwl new file mode 100644 index 000000000..0b34196ae --- /dev/null +++ b/tests/wf/adv_prov/tools/psp19_inputs.cwl @@ -0,0 +1,54 @@ +#!/usr/bin/env cwl-runner + +cwlVersion: v1.2 +class: CommandLineTool +# hints: +# DockerRequirement: +# dockerPull: amancevice/pandas:1.3.4-slim +# SoftwareRequirement: +# packages: +# numpy: +# specs: [ https://anaconda.org/conda-forge/numpy ] + # python: + # version: + +baseCommand: bash # python3 + +inputs: + script: + type: File + default: + class: File + location: ./get_psp19_inputs.py + # inputBinding: + # position: 1 + fasta: + type: Directory + format: edam:format_2200 + # inputBinding: + # position: 2 + outdir: + type: string + # inputBinding: + # position: 3 + # prefix: -o + default: "psp19_features" + +arguments: + - -c + - | + set -ex + mkdir $(inputs.outdir) + touch $(inputs.outdir)/$(inputs.fasta.basename) + +outputs: + psp19_features: + type: Directory + outputBinding: + glob: $(inputs.outdir) + +$namespaces: + edam: http://edamontology.org/ + +$schemas: +- https://edamontology.org/EDAM_1.25.owl diff --git a/tests/wf/adv_prov/tools/train_epitope_model.cwl
b/tests/wf/adv_prov/tools/train_epitope_model.cwl new file mode 100644 index 000000000..141180356 --- /dev/null +++ b/tests/wf/adv_prov/tools/train_epitope_model.cwl @@ -0,0 +1,68 @@ +#!/usr/bin/env cwl-runner + +cwlVersion: v1.2 +class: CommandLineTool + +baseCommand: python3 + +doc: "Model training." +intent: [ http://edamontology.org/operation_2423 ] +hints: + SoftwareRequirement: + packages: + python: + version: [ "3.9" ] + tqdm: + specs: [ https://pypi.org/project/tqdm/ ] + version: [ "4.64.0" ] + tensorflow-gpu: + specs: [ https://pypi.org/project/tensorflow-gpu/ ] + version: [ 2.9.1 ] + tensorflow-addons: + specs: [ https://pypi.org/project/tensorflow-addons/ ] + version: [ "0.17.1" ] + numpy: + version: [ "1.21.5" ] + click: + version: [ "8.0.4" ] + commentjson: + specs: [ https://pypi.org/project/commentjson/ ] + version: [ "0.9.0" ] + +arguments: +- $(inputs.script.path) +- $(inputs.config_file.path) +- $(inputs.input_features.path) +- $(inputs.input_labels.path) + +inputs: + script: + type: File + default: + class: File + location: ./emulated_model.py # this is a placeholder script + config_file: + type: File + default: + class: File + location: ../model_example_params.json + doc: "Configuration file used for the model. Here: standard file, but in real workflow it should be generated from previous steps." + input_features: + type: Directory + input_labels: + type: Directory + + +stdout: "training_log.txt" + +outputs: + train_log: + type: stdout + doc: "Output of the model containing predictions and/or performance on the test set." + + + + + + +