From 5466f95443b4bd0c2918b34f2ffb23b38b18c0a6 Mon Sep 17 00:00:00 2001 From: "Michael R. Crusoe" Date: Thu, 15 Dec 2022 17:33:44 +0100 Subject: [PATCH] start of a more advanced test --- MANIFEST.in | 3 + cwltool/provenance_profile.py | 14 +- tests/test_provenance.py | 21 ++ tests/wf/adv_prov/data/pdb_query.json | 0 .../data/prepared_biolip_win_p_testing.csv | 0 .../data/prepared_biolip_win_p_training.csv | 0 .../data/sabdab_summary_all_20220527.tsv | 0 tests/wf/adv_prov/model_example_params.json | 0 tests/wf/adv_prov/niaa_wf.cwl | 186 ++++++++++++++++++ tests/wf/adv_prov/niaa_wf_job.yml | 78 ++++++++ tests/wf/adv_prov/tools/combine_features.cwl | 88 +++++++++ tests/wf/adv_prov/tools/combine_inputs.py | 0 tests/wf/adv_prov/tools/combine_labels.cwl | 63 ++++++ tests/wf/adv_prov/tools/combine_labels.py | 0 tests/wf/adv_prov/tools/decompress.cwl | 32 +++ tests/wf/adv_prov/tools/dssp.cwl | 102 ++++++++++ tests/wf/adv_prov/tools/dssp_RASA.py | 0 tests/wf/adv_prov/tools/emulated_model.py | 0 .../tools/epitope_annotation_pipeline.py | 0 .../wf/adv_prov/tools/epitope_annotations.cwl | 100 ++++++++++ tests/wf/adv_prov/tools/get_pc7_inputs.py | 0 tests/wf/adv_prov/tools/get_psp19_inputs.py | 0 .../wf/adv_prov/tools/hhm_inputs_scatter.cwl | 70 +++++++ tests/wf/adv_prov/tools/pc7_inputs.cwl | 63 ++++++ .../wf/adv_prov/tools/pdb_batch_download.cwl | 88 +++++++++ tests/wf/adv_prov/tools/pdb_batch_download.sh | 0 tests/wf/adv_prov/tools/pdb_query.cwl | 76 +++++++ tests/wf/adv_prov/tools/pdb_query.py | 0 tests/wf/adv_prov/tools/ppi_annotations.cwl | 77 ++++++++ tests/wf/adv_prov/tools/ppi_annotations.py | 0 tests/wf/adv_prov/tools/process_sabdab.cwl | 67 +++++++ .../adv_prov/tools/process_sabdab_summary.py | 0 tests/wf/adv_prov/tools/psp19_inputs.cwl | 54 +++++ .../wf/adv_prov/tools/train_epitope_model.cwl | 68 +++++++ 34 files changed, 1246 insertions(+), 4 deletions(-) create mode 100644 tests/wf/adv_prov/data/pdb_query.json create mode 100644 
tests/wf/adv_prov/data/prepared_biolip_win_p_testing.csv create mode 100644 tests/wf/adv_prov/data/prepared_biolip_win_p_training.csv create mode 100644 tests/wf/adv_prov/data/sabdab_summary_all_20220527.tsv create mode 100644 tests/wf/adv_prov/model_example_params.json create mode 100644 tests/wf/adv_prov/niaa_wf.cwl create mode 100644 tests/wf/adv_prov/niaa_wf_job.yml create mode 100644 tests/wf/adv_prov/tools/combine_features.cwl create mode 100644 tests/wf/adv_prov/tools/combine_inputs.py create mode 100644 tests/wf/adv_prov/tools/combine_labels.cwl create mode 100644 tests/wf/adv_prov/tools/combine_labels.py create mode 100644 tests/wf/adv_prov/tools/decompress.cwl create mode 100644 tests/wf/adv_prov/tools/dssp.cwl create mode 100644 tests/wf/adv_prov/tools/dssp_RASA.py create mode 100644 tests/wf/adv_prov/tools/emulated_model.py create mode 100644 tests/wf/adv_prov/tools/epitope_annotation_pipeline.py create mode 100644 tests/wf/adv_prov/tools/epitope_annotations.cwl create mode 100644 tests/wf/adv_prov/tools/get_pc7_inputs.py create mode 100644 tests/wf/adv_prov/tools/get_psp19_inputs.py create mode 100644 tests/wf/adv_prov/tools/hhm_inputs_scatter.cwl create mode 100644 tests/wf/adv_prov/tools/pc7_inputs.cwl create mode 100644 tests/wf/adv_prov/tools/pdb_batch_download.cwl create mode 100644 tests/wf/adv_prov/tools/pdb_batch_download.sh create mode 100644 tests/wf/adv_prov/tools/pdb_query.cwl create mode 100644 tests/wf/adv_prov/tools/pdb_query.py create mode 100644 tests/wf/adv_prov/tools/ppi_annotations.cwl create mode 100644 tests/wf/adv_prov/tools/ppi_annotations.py create mode 100644 tests/wf/adv_prov/tools/process_sabdab.cwl create mode 100644 tests/wf/adv_prov/tools/process_sabdab_summary.py create mode 100644 tests/wf/adv_prov/tools/psp19_inputs.cwl create mode 100644 tests/wf/adv_prov/tools/train_epitope_model.cwl diff --git a/MANIFEST.in b/MANIFEST.in index f314e9ae2..0939a4cc2 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -9,6 +9,9 @@ include 
tests/loop/* include tests/tmp1/tmp2/tmp3/.gitkeep include tests/tmp4/alpha/* include tests/wf/* +include tests/wf/adv_prov/* +include tests/wf/adv_prov/data/* +include tests/wf/adv_prov/tools/* include tests/wf/operation/* include tests/override/* include tests/reloc/*.cwl diff --git a/cwltool/provenance_profile.py b/cwltool/provenance_profile.py index 0c1445cd2..158144a64 100644 --- a/cwltool/provenance_profile.py +++ b/cwltool/provenance_profile.py @@ -404,10 +404,16 @@ def declare_file(self, value: CWLObjectType) -> Tuple[ProvEntity, ProvEntity, st # Transfer SCHEMA annotations to provenance for s in schema_annotations: if "additionalType" in s: - additional_type = cast(str, schema_annotations[s]).split(sep="/")[ - -1 - ] # find better method? - file_entity.add_attributes({PROV_TYPE: SCHEMA[additional_type]}) + atype = schema_annotations[s] + if isinstance(atype, str): + additional_type = atype.split(sep="/")[-1] # find better method? + file_entity.add_attributes({PROV_TYPE: SCHEMA[additional_type]}) + else: + for a_entry in cast(List[str], atype): + additional_type = a_entry.split(sep="/")[ + -1 + ] # find better method? 
+ file_entity.add_attributes({PROV_TYPE: SCHEMA[additional_type]}) else: file_entity = self._add_nested_annotations( s, schema_annotations[s], file_entity diff --git a/tests/test_provenance.py b/tests/test_provenance.py index cfb80ccb8..a801d2eeb 100644 --- a/tests/test_provenance.py +++ b/tests/test_provenance.py @@ -103,6 +103,27 @@ def test_revsort_label_annotations(tmp_path: Path) -> None: ) +def test_advanced_prov_annotations(tmp_path: Path) -> None: + """Pass through of advanced input annotations.""" + base_path = cwltool( + tmp_path, + get_data("tests/wf/adv_prov/niaa_wf.cwl"), + get_data("tests/wf/adv_prov/niaa_wf_job.yml"), + ) + prov_file = base_path / "metadata" / "provenance" / "primary.cwlprov.nt" + arcp_root = find_arcp(base_path) + g = Graph() + with open(prov_file, "rb") as f: + g.parse(file=f, format="nt", publicID=arcp_root) + mime_having_objects = list(g.subjects(SCHEMA.encodingFormat)) + assert len(mime_having_objects) == 8 + # for obj in mime_having_objects: + # assert ( + # cast(Literal, list(g.objects(obj, SCHEMA.encodingFormat))[0]).value + # == "https://www.iana.org/assignments/media-types/text/plain" + # ) + + @needs_docker def test_nested_workflow(tmp_path: Path) -> None: check_provenance(cwltool(tmp_path, get_data("tests/wf/nested.cwl")), nested=True) diff --git a/tests/wf/adv_prov/data/pdb_query.json b/tests/wf/adv_prov/data/pdb_query.json new file mode 100644 index 000000000..e69de29bb diff --git a/tests/wf/adv_prov/data/prepared_biolip_win_p_testing.csv b/tests/wf/adv_prov/data/prepared_biolip_win_p_testing.csv new file mode 100644 index 000000000..e69de29bb diff --git a/tests/wf/adv_prov/data/prepared_biolip_win_p_training.csv b/tests/wf/adv_prov/data/prepared_biolip_win_p_training.csv new file mode 100644 index 000000000..e69de29bb diff --git a/tests/wf/adv_prov/data/sabdab_summary_all_20220527.tsv b/tests/wf/adv_prov/data/sabdab_summary_all_20220527.tsv new file mode 100644 index 000000000..e69de29bb diff --git 
a/tests/wf/adv_prov/model_example_params.json b/tests/wf/adv_prov/model_example_params.json new file mode 100644 index 000000000..e69de29bb diff --git a/tests/wf/adv_prov/niaa_wf.cwl b/tests/wf/adv_prov/niaa_wf.cwl new file mode 100644 index 000000000..fc45dd88d --- /dev/null +++ b/tests/wf/adv_prov/niaa_wf.cwl @@ -0,0 +1,186 @@ +#!/usr/bin/env cwl-runner + +cwlVersion: v1.2 +class: Workflow + +intent: [ edam:operation_2423 ] # Prediction ope +doc: "This mock workflow calculates input features and labels which are used to train a deep learning model for epitope prediction." + +requirements: + ScatterFeatureRequirement: {} + StepInputExpressionRequirement: {} + SubworkflowFeatureRequirement: {} + +inputs: + sabdab_summary: + type: File + format: iana:text/tab-separated-values + doc: "SAbDAb Summary metadata about all structures in the database." + biodl_train_dataset: + type: File + format: iana:text/csv + doc: "BioDL training dataset containing PPI interactions" + biodl_test_dataset: + type: File + doc: "BioDL test dataset with PPI interactions." + hhblits_db: + type: Directory + doc: "Reference database for HHblits" + hhblits_db_name: + type: string + doc: "Name of hhblits reference database" + pdb_search_api_query: + type: File + format: iana:application/json + doc: "Structured query for PDB API." + +outputs: + model_output: + type: File + outputSource: train_epitope_prediction_model/train_log + doc: "Output of the prediction model." + +steps: + run_pdb_query: + in: + pdb_search_query: pdb_search_api_query + out: + [ processed_response ] + run: ./tools/pdb_query.cwl + doc: | + Use PDB search API to run a query on the Protein Data Bank. Returns .txt file with comma-separated PDB IDs which satisfy the query requirements. + See https://search.rcsb.org/index.html#search-api for a tutorial. 
+ + download_pdb_files: + in: + input_file: run_pdb_query/processed_response + mmcif_format: { default: True } + pdb_format: { default: True } + out: + [ pdb_files ] + run: ./tools/pdb_batch_download.cwl + + decompress_pdb_files: + in: + pdb_archives: download_pdb_files/pdb_files + out: [ cifs, pdbs ] + run: ./tools/decompress.cwl + doc: "Decompress files using gzip" + + generate_dssp_labels: + in: + pdb_files: decompress_pdb_files/pdbs # change this later + rsa_cutoff: { default : 0.06 } + out: [ dssp_output_files ] + run: ./tools/dssp.cwl + doc: "Use DSSP to extract secondary structure and solvent accessibility from PDB files." + + generate_ppi_labels: + in: + mmcif_files: decompress_pdb_files/cifs + train_dataset: biodl_train_dataset + test_dataset: biodl_test_dataset + out: [ ppi_fasta_files ] + run: ./tools/ppi_annotations.cwl + doc: "Extract ppi annotations from BioDL. This step is partly emulated." + + preprocess_sabdab_data: + doc: "Extract antigen chains from SAbDab summary file." + in: + sabdab_summary: sabdab_summary + out: [ processed_summary ] + run: ./tools/process_sabdab.cwl + + generate_epitope_labels: + in: + mmcif_files: decompress_pdb_files/cifs + sabdab_processed: preprocess_sabdab_data/processed_summary + out: [ epitope_fasta_dir ] + run: ./tools/epitope_annotations.cwl + doc: "Extract epitope annotations from PDB files." + + combine_labels: + doc: "Combine labels into 1 file per protein sequence." + run: ./tools/combine_labels.cwl + in: + epitope_directory: generate_epitope_labels/epitope_fasta_dir + ppi_directory: generate_ppi_labels/ppi_fasta_files + dssp_directory: generate_dssp_labels/dssp_output_files + out: [ labels_combined ] + + generate_pc7: + doc: Calculate PC7 features for each residue in each protein sequence. 
+ run: ./tools/pc7_inputs.cwl # to do: adapt tool so it takes directory of fasta files as input + in: + fasta: generate_ppi_labels/ppi_fasta_files + out: [ pc7_features ] + + generate_psp19: + label: Calculate PSP19 features for each residue in each protein sequence. + run: ./tools/psp19_inputs.cwl + in: + fasta: generate_ppi_labels/ppi_fasta_files + out: [ psp19_features ] + + generate_hhm: + in: + query_sequences: + source: generate_ppi_labels/ppi_fasta_files # type Directory + valueFrom: $(self.listing) # here type Directory is converted to File array + hhblits_db: hhblits_db + hhblits_db_name: hhblits_db_name + hhblits_n_iterations: { default: 1 } + out: [ hhm_file_array ] + run: + class: Workflow # this is a subworkflow as a workaround because generate_ppi_labels/ppi_fasta_files is Directory while run_hhblits takes File + inputs: + query_sequences: File[] + hhblits_db: Directory + hhblits_db_name: string + hhblits_n_iterations: int + outputs: + hhm_file_array: + type: File[] + outputSource: run_hhblits/hhm + steps: + run_hhblits: + in: + protein_query_sequence: query_sequences + database: hhblits_db + database_name: hhblits_db_name + n_iterations: hhblits_n_iterations + out: [ hhm ] + scatter: protein_query_sequence + run: ./tools/hhm_inputs_scatter.cwl + combine_features: + in: + input_sequences: generate_ppi_labels/ppi_fasta_files + pc7_features: generate_pc7/pc7_features + psp19_features: generate_psp19/psp19_features + hhm_features: generate_hhm/hhm_file_array # file array, combine_features.cwl converts it to directory + out: [ combined_features ] + run: ./tools/combine_features.cwl + + train_epitope_prediction_model: # This step incorporates both training and prediction, not sure if this is the case in the real workflow. 
+ in: # in the real workflow, the configuration file would be generated as part of the workflow as well + input_features: combine_features/combined_features + input_labels: combine_labels/labels_combined + out: [ train_log ] + run: ./tools/train_epitope_model.cwl + doc: "Predict epitope residues using a multi-task learning approach. This step is not real yet." + +$namespaces: + iana: "https://www.iana.org/assignments/media-types/" + s: "https://schema.org/" + edam: "http://edamontology.org/" + cwlprov: "https://w3id.org/cwl/prov#" + +$schemas: +- https://schema.org/version/latest/schemaorg-current-https.rdf +- https://edamontology.org/EDAM_1.25.owl + +s:author: +- s:name: "Renske de Wit" + s:identifier: https://orcid.org/0000-0003-0902-0086 +- s:name: "Katharina Waury" +s:license: https://spdx.org/licenses/Apache-2.0 diff --git a/tests/wf/adv_prov/niaa_wf_job.yml b/tests/wf/adv_prov/niaa_wf_job.yml new file mode 100644 index 000000000..787274ece --- /dev/null +++ b/tests/wf/adv_prov/niaa_wf_job.yml @@ -0,0 +1,78 @@ +cwlprov:prov: + sabdab_search: + s:additionalType: s:SearchAction + s:query: "All structures" + s:endTime: 2022-05-27 + s:object: + s:name: "Structural Antibody Database" + s:citation: + s:identifier: https://doi.org/10.1093/nar/gkab1050 + s:result: sabdab_summary + s:description: "Search Action for metadata on antibody-antigen complexes in SAbDab" + + +pdb_search_api_query: + class: File + location: ./data/pdb_query.json + format: iana:application/json + s:description: "Input query for PDB search API." + s:additionalType: + - edam:data_3786 # Query script + +sabdab_summary: + class: File + path: ./data/sabdab_summary_all_20220527.tsv + format: iana:text/tab-separated-values + s:description: "Summary file downloaded from SAbDAb database, containing metadata for all structures." 
+ s:additionalType: + - edam:data_2080 # database search results + - s:Dataset + + +biodl_train_dataset: + class: File + path: data/prepared_biolip_win_p_training.csv + #location: https://www.ibi.vu.nl/downloads/PIPENN/PIPENN/BioDL-Datasets/prepared_biolip_win_p_training.csv + format: iana:text/csv + s:description: "BioDL training set containing PPI annotations for protein sequences (UniProt IDs)" + s:name: "BioDL training dataset" + s:citation: + s:identifier: https://doi.org/10.1093/bioinformatics/btac071 + s:additionalType: + - s:Dataset + - edam:data_1277 # protein features + +biodl_test_dataset: + class: File + path: data/prepared_biolip_win_p_testing.csv + #location: https://www.ibi.vu.nl/downloads/PIPENN/PIPENN/BioDL-Datasets/prepared_biolip_win_p_testing.csv + s:description: "BioDL test set containing PPI annotations for protein sequences (UniProt IDs)." + s:name: "BioDL test dataset" + s:citation: + s:identifier: https://doi.org/10.1093/bioinformatics/btac071 + s:additionalType: + - s:Dataset + - edam:data_1277 # protein features + +hhblits_db: + class: Directory + location: ../hhblits/databases + s:citation: + s:identifier: https://doi.org/10.1038/nmeth.1818 + s:name: "pdb70" + s:description: "Directory containing HHBlits reference database." + s:additionalType: + - s:Dataset + - edam:data_0955 # data index + +hhblits_db_name: pdb70 +hhblits_n_iterations: 1 + +s:description: "Demonstration run of epitope prediction workflow. Some steps are emulated, so the results of the workflow are not yet biologically meaningful." 
+ +$namespaces: + iana: "https://www.iana.org/assignments/media-types/" + s: "https://schema.org/" + edam: "http://edamontology.org/" + cwlprov: "https://w3id.org/cwl/prov#" + diff --git a/tests/wf/adv_prov/tools/combine_features.cwl b/tests/wf/adv_prov/tools/combine_features.cwl new file mode 100644 index 000000000..1cf62735d --- /dev/null +++ b/tests/wf/adv_prov/tools/combine_features.cwl @@ -0,0 +1,88 @@ +#!/usr/bin/env cwl-runner + +cwlVersion: v1.2 +class: CommandLineTool +baseCommand: bash # python3 + +label: Combine input features + +doc: | + "Combines the input features for each protein sequence into 1 file per sequence. Output is stored in a new directory." + +hints: + # DockerRequirement: + # dockerPull: amancevice/pandas:1.3.4-slim + SoftwareRequirement: + packages: + numpy: + specs: [ https://anaconda.org/conda-forge/numpy ] + version: [ "1.21.4" ] + pandas: + specs: [ https://anaconda.org/conda-forge/pandas ] + version: [ "1.3.4" ] + +requirements: + InlineJavascriptRequirement: {} + InitialWorkDirRequirement: + listing: | + ${ + return [{"entry": {"class": "Directory", "basename": "hhm_features_dir", "listing": inputs.hhm_features}, "writable": true}] + } + +arguments: + # - $(inputs.script.path) + # - $(inputs.input_sequences.path) + # - "hhm_features_dir" + # - $(inputs.pc7_features.path) + # - $(inputs.psp19_features.path) + # - "--outdir" + # - ./$(inputs.outdir_name) # An output directory will be created in current working directory + - -c + - | + set -ex + mkdir $(inputs.outdir_name) + touch $(inputs.outdir_name)/$(inputs.input_sequences.basename) + + +inputs: + script: + type: File + default: + class: File + location: ./combine_inputs.py + input_sequences: + type: Directory + # default: + # class: Directory + # location: ../data/test_set/ppi_fasta # delete this later + hhm_features: + type: File[] + # default: + # - class: File + # location: ../final_test_run/2HKF_P.hhm + # - class: File + # location: ../final_test_run/4W6W_A.hhm + # - class: 
File + # location: ../final_test_run/4W6X_A.hhm + # - class: File + # location: ../final_test_run/4W6Y_A.hhm + pc7_features: + type: Directory + # default: + # class: Directory + # location: ../final_test_run/pc7_features + psp19_features: + type: Directory + # default: + # class: Directory + # location: ../final_test_run/psp19_features + outdir_name: + type: string + default: "input_features" + +outputs: + combined_features: + type: Directory + outputBinding: + glob: $(inputs.outdir_name) + diff --git a/tests/wf/adv_prov/tools/combine_inputs.py b/tests/wf/adv_prov/tools/combine_inputs.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/wf/adv_prov/tools/combine_labels.cwl b/tests/wf/adv_prov/tools/combine_labels.cwl new file mode 100644 index 000000000..157e9fd7e --- /dev/null +++ b/tests/wf/adv_prov/tools/combine_labels.cwl @@ -0,0 +1,63 @@ +#!/usr/bin/env cwl-runner + +cwlVersion: v1.2 +class: CommandLineTool + +baseCommand: bash # python3 + +# hints: +# DockerRequirement: +# dockerPull: amancevice/pandas:1.3.4-slim +# SoftwareRequirement: +# packages: +# pandas: +# specs: [ https://anaconda.org/conda-forge/pandas ] +# version: [ "1.3.4" ] +# python: +# version: [ "3.9.7" ] + +arguments: + # - $(inputs.script.path) + # - $(inputs.epitope_directory.path) + # - $(inputs.ppi_directory.path) + # - $(inputs.dssp_directory.path) + # - "--outdir" + # - $(inputs.output_directory) + - -c + - | + set -ex + mkdir $(inputs.output_directory) + touch $(inputs.output_directory)/$(inputs.epitope_directory.basename) + touch $(inputs.output_directory)/$(inputs.ppi_directory.basename) + touch $(inputs.output_directory)/$(inputs.dssp_directory.basename) + + +inputs: + script: + type: File + default: + class: File + location: ./combine_labels.py + epitope_directory: + type: Directory + doc: Directory with FASTA files with epitope annotations. + ppi_directory: + type: Directory + doc: Directory with FASTA files with PPI annotations. 
+ dssp_directory: + type: Directory + doc: Directory with DSSP output files. + output_directory: + type: string + default: "./combined_labels" + +outputs: + labels_combined: + type: Directory + doc: "Directory with 1 file per sequence, containing label values for each residue" + outputBinding: + glob: $(inputs.output_directory) + + + + diff --git a/tests/wf/adv_prov/tools/combine_labels.py b/tests/wf/adv_prov/tools/combine_labels.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/wf/adv_prov/tools/decompress.cwl b/tests/wf/adv_prov/tools/decompress.cwl new file mode 100644 index 000000000..8c68ccb6e --- /dev/null +++ b/tests/wf/adv_prov/tools/decompress.cwl @@ -0,0 +1,32 @@ +#!/usr/bin/env cwl-runner + +cwlVersion: v1.2 +class: CommandLineTool + +requirements: + InitialWorkDirRequirement: + listing: $(inputs.pdb_archives) + +baseCommand: bash # gunzip + +arguments: + - -c + - | + set -ex; for file in *.gz; do mv \${file} \${file%%.gz}; done + +inputs: + pdb_archives: + type: File[] + # inputBinding: + # position: 0 + +outputs: + cifs: + type: File[] + outputBinding: + glob: "*.cif" + pdbs: + type: File[] + outputBinding: + glob: "*.pdb" + diff --git a/tests/wf/adv_prov/tools/dssp.cwl b/tests/wf/adv_prov/tools/dssp.cwl new file mode 100644 index 000000000..6279f2444 --- /dev/null +++ b/tests/wf/adv_prov/tools/dssp.cwl @@ -0,0 +1,102 @@ +#!/usr/bin/env cwl-runner + +cwlVersion: v1.2 +class: CommandLineTool +baseCommand: bash # python3 + +doc: "Use DSSP to extract secondary structure and solvent accessibility from PDB files." 
+intent: [ http://edamontology.org/operation_0320 ] + +requirements: + InlineJavascriptRequirement: {} + InitialWorkDirRequirement: # the script takes a directory as input + listing: | + ${ + return [{"entry": {"class": "Directory", "basename": "pdb_source_dir", "listing": inputs.pdb_files}, "writable": true}] + } + +hints: + # DockerRequirement: + # dockerPull: biopython/biopython@sha256:437075df44b0c9b3da96f71040baef0086789de7edf73c81de4ace30a127a245 + SoftwareRequirement: + packages: + pandas: + version: [ "0.19.1" ] + specs: [ https://pypi.org/project/pandas/ ] + biopython: + specs: [ https://pypi.org/project/biopython/ ] + version: [ "1.75" ] + dssp: + specs: [ https://swift.cmbi.umcn.nl/gv/dssp/ ] + version: [ "2.0.4" ] # this version does not support mmCIF files + python: + version: [ "3.5" ] + +arguments: + # - $(inputs.script.path) + # - "pdb_source_dir" + # - "-o" + # - $(inputs.output_dir) + # - "-d" + # - $(inputs.dssp) + # - "-c" + # - $(inputs.rsa_cutoff) + - -c + - | + set -ex + mkdir $(inputs.output_dir) + touch $(inputs.output_dir)/$(inputs.pdb_files[0].nameroot) + + +inputs: + script: + type: File + default: + class: File + location: ./dssp_RASA.py + pdb_files: + type: File[] + doc: "Protein structures in PDB format." + output_dir: + type: string + default: "dssp_output" + dssp: + type: string + default: "dssp" # for newer dssp versions: mkdssp + rsa_cutoff: + type: float + default: 0.06 + doc: "Threshold exposed surface area for considering amino acids buried." 
+ +outputs: + dssp_output_files: + type: Directory + outputBinding: + glob: $(inputs.output_dir) + +s:author: +- class: s:Person + s:name: "Renske de Wit" +s:license: https://spdx.org/licenses/Apache-2.0 +s:dateCreated: "2022-05-28" +s:mainEntity: + class: s:SoftwareApplication + s:license: https://spdx.org/licenses/Apache-2.0 + s:author: + - class: s:Person + s:name: "DS" + s:description: "Script which takes a directory of pdb files as input and calculates relative surface accessibility for each residue in the protein sequence." + s:basedOn: + - class: s:SoftwareApplication + s:name: "DSSP" + +$namespaces: + s: https://schema.org/ + edam: http://edamontology.org/ + +$schemas: +- https://schema.org/version/latest/schemaorg-current-https.rdf +- https://edamontology.org/EDAM_1.25.owl + + + diff --git a/tests/wf/adv_prov/tools/dssp_RASA.py b/tests/wf/adv_prov/tools/dssp_RASA.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/wf/adv_prov/tools/emulated_model.py b/tests/wf/adv_prov/tools/emulated_model.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/wf/adv_prov/tools/epitope_annotation_pipeline.py b/tests/wf/adv_prov/tools/epitope_annotation_pipeline.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/wf/adv_prov/tools/epitope_annotations.cwl b/tests/wf/adv_prov/tools/epitope_annotations.cwl new file mode 100644 index 000000000..7d744194e --- /dev/null +++ b/tests/wf/adv_prov/tools/epitope_annotations.cwl @@ -0,0 +1,100 @@ +#!/usr/bin/env cwl-runner + +cwlVersion: v1.2 +class: CommandLineTool + +baseCommand: bash # python3 + +intent: [ http://edamontology.org/operation_0320 ] + +requirements: + InlineJavascriptRequirement: {} + InitialWorkDirRequirement: # the script takes a directory as input + listing: | + ${ + return [{"entry": {"class": "Directory", "basename": "mmcif_directory", "listing": inputs.mmcif_files}, "writable": true}] + } + +doc: | + Runs Python script which takes directory of mmCIF files as 
input and outputs directory of FASTA files with protein sequence + epitope annotations. + +hints: + # DockerRequirement: + # dockerImageId: pdbecif-pandas:20220620 + # dockerFile: | + # FROM docker.io/debian:stable-slim + # RUN apt-get update && apt-get install -y --no-install-recommends python3-pip + # RUN python3 -m pip install PDBeCif pandas + SoftwareRequirement: + packages: + pandas: + specs: [ https://anaconda.org/conda-forge/pandas ] + version: [ "1.2.4" ] + python: + version: [ "3.9.1" ] + pdbecif: + specs: [ https://pypi.org/project/PDBeCif/ ] + version: [ "1.5" ] + +arguments: + # - $(inputs.script.path) + # - "mmcif_directory" + # - $(inputs.sabdab_processed.path) + # - "--fasta_directory" + # - $(inputs.fasta_output_dir) + # - "--df_directory" + # - $(inputs.df_output_dir) + - -c + - | + mkdir $(inputs.fasta_output_dir) $(inputs.df_output_dir); + touch $(inputs.fasta_output_dir)/$(inputs.mmcif_files[0].basename).fasta + touch $(inputs.df_output_dir)/$(inputs.mmcif_files[0].basename).df + +inputs: + script: + type: File + default: + class: File + location: ./epitope_annotation_pipeline.py + mmcif_files: + type: File[] + doc: mmCIF file array + sabdab_processed: + format: iana:text/csv + type: File + doc: "table of PDB entries with associated H, L and antigen chain." + fasta_output_dir: + type: string + default: "./epitope_fasta" + df_output_dir: + type: string + default: "./epitope_df" + +outputs: + epitope_fasta_dir: + type: Directory + outputBinding: + glob: $(inputs.fasta_output_dir) + epitope_df_dir: + type: Directory + outputBinding: + glob: $(inputs.df_output_dir) + +s:dateCreated: 2022-05-30 + +s:mainEntity: + s:additionalType: s:SoftwareApplication + s:author: + - s:name: "Katharina Waury" + s:dateCreated: 2022-02-10 + s:programmingLanguage: Python + s:description: "Script which extracts epitope annotations and dataframes from mmCIF files." 
+ +$namespaces: + s: https://schema.org/ + edam: http://edamontology.org/ + iana: https://www.iana.org/assignments/media-types/ + +$schemas: +- https://schema.org/version/latest/schemaorg-current-https.rdf +- https://edamontology.org/EDAM_1.25.owl diff --git a/tests/wf/adv_prov/tools/get_pc7_inputs.py b/tests/wf/adv_prov/tools/get_pc7_inputs.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/wf/adv_prov/tools/get_psp19_inputs.py b/tests/wf/adv_prov/tools/get_psp19_inputs.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/wf/adv_prov/tools/hhm_inputs_scatter.cwl b/tests/wf/adv_prov/tools/hhm_inputs_scatter.cwl new file mode 100644 index 000000000..eaba7bf1a --- /dev/null +++ b/tests/wf/adv_prov/tools/hhm_inputs_scatter.cwl @@ -0,0 +1,70 @@ +cwlVersion: v1.2 +class: CommandLineTool +baseCommand: bash # hhblits + +doc: | + CommandLineTool for hhblits, part of HH-suite. See https://github.com/soedinglab/hh-suite for documentation. +hints: + SoftwareRequirement: + packages: + hhsuite: + specs: + - https://anaconda.org/bioconda/hhsuite + - https://bio.tools/hhsuite + version: [ "3.3.0" ] +# DockerRequirement: +# dockerPull: quay.io/biocontainers/hhsuite:3.3.0--py39pl5321h67e14b5_5 # this is the version opus-tass uses? 
+ +inputs: + protein_query_sequence: + type: File + # format: [ + # edam:format_1929, # FASTA + # edam:format_3281, # A2M + # ] + database: Directory # too large to be included in RO, change later to type string = path to database + database_name: string + n_iterations: + type: int + default: 2 # change this to the correct value + + +arguments: + # - "-i" + # - $(inputs.protein_query_sequence.path) #$(inputs.fasta_dir.path)/$(inputs.protein_id).fasta + # - "-d" + # - $(inputs.database.path)/$(inputs.database_name) + # - "-o" + # - $(inputs.protein_query_sequence.nameroot).hhr + # - "-ohhm" + # - $(inputs.protein_query_sequence.nameroot).hhm + # - "-n" + # - $(inputs.n_iterations) + - -c + - | + set +ex + touch $(inputs.protein_query_sequence.nameroot).hhr + touch $(inputs.protein_query_sequence.nameroot).hhm + + +outputs: + hhm: + type: File + outputBinding: + glob: "*.hhm" + + +s:author: # Creator of this CWL document +- s:identifier: https://orcid.org/0000-0003-0902-0086 + +s:license: Apache-2.0 + +$namespaces: + s: https://schema.org/ + edam: http://edamontology.org/ + +$schemas: +- https://schema.org/version/latest/schemaorg-current-https.rdf +- https://edamontology.org/EDAM_1.25.owl + + diff --git a/tests/wf/adv_prov/tools/pc7_inputs.cwl b/tests/wf/adv_prov/tools/pc7_inputs.cwl new file mode 100644 index 000000000..003fef3cf --- /dev/null +++ b/tests/wf/adv_prov/tools/pc7_inputs.cwl @@ -0,0 +1,63 @@ +#!/usr/bin/env cwl-runner + +cwlVersion: v1.0 +class: CommandLineTool +# hints: +# DockerRequirement: +# dockerPull: amancevice/pandas:1.3.4-slim # Script needs numpy which is a dependency of pandas +# SoftwareRequirement: +# packages: +# numpy: +# specs: [ https://anaconda.org/conda-forge/numpy ] + +baseCommand: bash # python3 + +doc: PC7 features are assigned to each residue in each protein sequence. Output is a directory of files (1 per sequence). 
+# intent: [ http://edamontology.org/operation_0361 ] + +inputs: + script: + type: File + default: + class: File + location: ./get_pc7_inputs.py + # inputBinding: { position: 1 } + fasta: + type: Directory + format: edam:format_2200 # fasta-like (text) + # inputBinding: + # position: 2 + outdir: + type: string + # inputBinding: + # position: 3 + # prefix: -o + default: "pc7_features" + +arguments: + - -c + - | + mkdir $(inputs.outdir) + touch $(inputs.outdir)/$(inputs.fasta.nameroot) + +outputs: + pc7_features: + type: Directory + outputBinding: + glob: $(inputs.outdir) + +s:mainEntity: # add that this is a commandlinetool + s:programmingLanguage: Python + s:codeRepository: https://github.com/RenskeW/cwl-epitope/blob/b5e31d42006fd7003716f57963646d47d1154549/tools/get_pc7_inputs.py + s:isBasedOn: + - s:additionalType: s:SoftwareApplication + s:name: OPUS-TASS + s:identifier: https://bio.tools/opus-tass + +$namespaces: + s: https://schema.org/ + edam: http://edamontology.org/ + +$schemas: +- https://schema.org/version/latest/schemaorg-current-https.rdf +- https://edamontology.org/EDAM_1.25.owl diff --git a/tests/wf/adv_prov/tools/pdb_batch_download.cwl b/tests/wf/adv_prov/tools/pdb_batch_download.cwl new file mode 100644 index 000000000..97b3bde3b --- /dev/null +++ b/tests/wf/adv_prov/tools/pdb_batch_download.cwl @@ -0,0 +1,88 @@ +#!/usr/env/bin cwl-runner + +cwlVersion: v1.2 +class: CommandLineTool + +baseCommand: touch # bash + +doc: "Download files from the PDB in a specific format." 
+ +intent: [ http://edamontology.org/operation_2422 ] +requirements: + NetworkAccess: + networkAccess: True + +inputs: + script: + type: File + # inputBinding: + # position: 1 + default: + class: File + location: ./pdb_batch_download.sh + input_file: + doc: "Comma-separated .txt file with pdb entries to download" + type: File + format: iana:text/csv + # inputBinding: + # position: 3 + # prefix: "-f" + mmcif_format: # The last arguments specify the format in which entries will be downloaded + type: boolean + # inputBinding: + # position: 4 + # prefix: "-c" # .cif.gz + default: True + pdb_format: + type: boolean + # inputBinding: + # position: 5 + # prefix: "-p" # .pdb.gz + default: False + pdb1_format: + type: boolean + # inputBinding: + # position: 6 + # prefix: "-a" # .pdb1.gz + default: False + xml_format: + type: boolean + # inputBinding: + # position: 7 + # prefix: "-x" # .xml.gz + default: False + sfcif_format: + type: boolean + # inputBinding: + # position: 8 + # prefix: "-s" # .sf.cif.gz + default: False + mr_format: + type: boolean + # inputBinding: + # position: 9 + # prefix: "-m" # .mr.gz + default: False + mr_str_format: + type: boolean + # inputBinding: + # position: 10 + # prefix: "-r" # .mr.str.gz + default: False + +arguments: + - $(inputs.input_file.nameroot).1.cif.gz + - $(inputs.input_file.nameroot).2.cif.gz + - $(inputs.input_file.nameroot).1.pdb.gz + - $(inputs.input_file.nameroot).2.pdb.gz + +outputs: + pdb_files: + type: File[] + outputBinding: + glob: "*.gz" + doc: "Downloaded files" + + +$namespaces: + iana: https://www.iana.org/assignments/media-types/ diff --git a/tests/wf/adv_prov/tools/pdb_batch_download.sh b/tests/wf/adv_prov/tools/pdb_batch_download.sh new file mode 100644 index 000000000..e69de29bb diff --git a/tests/wf/adv_prov/tools/pdb_query.cwl b/tests/wf/adv_prov/tools/pdb_query.cwl new file mode 100644 index 000000000..62d34c584 --- /dev/null +++ b/tests/wf/adv_prov/tools/pdb_query.cwl @@ -0,0 +1,76 @@ +#!/usr/bin/env cwl-runner 
+ +cwlVersion: v1.2 +class: CommandLineTool + +baseCommand: cat # python3 + +requirements: + NetworkAccess: + networkAccess: True + +intent: [ http://edamontology.org/operation_2421 ] # Database search + +hints: + # DockerRequirement: + # dockerPull: nyurik/alpine-python3-requests@sha256:e0553236e3ebaa240752b41b8475afb454c5ab4c17eb023a2a904637eda16cf6 + SoftwareRequirement: + packages: + python3: + version: [ 3.9.5 ] + requests: + version: [ 2.25.1 ] + +arguments: + # - $(inputs.script.path) + - $(inputs.pdb_search_query.path) + # - "--outpath" + # - $(inputs.return_file) + +stdout: $(inputs.return_file) + +inputs: + script: + type: File + default: + class: File + location: ./pdb_query.py + pdb_search_query: + type: File + label: Query for PDB search API in json format + format: iana:application/json + return_file: + type: string + label: Path to output file + default: "./pdb_ids.txt" + doc: "Comma-separated text file with PDB ids" + +outputs: + processed_response: + type: File + format: iana:text/csv + doc: Comma-separated text file with returned identifiers from PDB search API + outputBinding: + glob: $(inputs.return_file) + +# label: Query PDB search API and store output in comma-separated text file. + +doc: | + This tool invokes a Python script which uses requests library to query PDB search API and return a comma-separated file of identifiers returned by the API. 
+ More information about PDB search API: https://search.rcsb.org/index.html + + +$namespaces: + iana: https://www.iana.org/assignments/media-types/ + s: https://schema.org/ + +$schemas: +- https://schema.org/version/latest/schemaorg-current-https.rdf + +s:author: +- s:identifier: https://orcid.org/0000-0003-0902-0086 + +s:mainEntity: + s:author: + - s:identifier: https://orcid.org/0000-0003-0902-0086 + diff --git a/tests/wf/adv_prov/tools/pdb_query.py b/tests/wf/adv_prov/tools/pdb_query.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/wf/adv_prov/tools/ppi_annotations.cwl b/tests/wf/adv_prov/tools/ppi_annotations.cwl new file mode 100644 index 000000000..2ce630b18 --- /dev/null +++ b/tests/wf/adv_prov/tools/ppi_annotations.cwl @@ -0,0 +1,77 @@ +#!/usr/bin/env cwl-runner + +cwlVersion: v1.2 +class: CommandLineTool + +baseCommand: bash # python3 + +doc: "Extract PPI annotations from BioDL." +intent: [ http://edamontology.org/operation_0320 ] + +hints: + # DockerRequirement: + # dockerImageId: pdbecif-pandas:20220620 + # dockerFile: | + # FROM docker.io/debian:stable-slim + # RUN apt-get update && apt-get install -y --no-install-recommends python3-pip + # RUN python3 -m pip install PDBeCif pandas + SoftwareRequirement: + packages: + pandas: + specs: [ https://anaconda.org/conda-forge/pandas ] + version: [ "1.2.4" ] + python: + version: [ "3.9.1" ] + pdbecif: + specs: [ https://pypi.org/project/PDBeCif/ ] + version: [ "1.5" ] + +requirements: + InlineJavascriptRequirement: {} + InitialWorkDirRequirement: # the script takes a directory as input + listing: + - entry: | + ${ return {"class": "Directory", + "listing": inputs.mmcif_files }; + } + entryname: mmcif_directory + # writable: true + +inputs: + script: + type: File + default: + class: File + location: ./ppi_annotations.py + mmcif_files: # the download leaves us with an array of files, but script takes type Directory --> InitialWorkdirRequirement + type: File[] + train_dataset: + type: File
+ doc: "BioDL training set" + test_dataset: + type: File + doc: "BioDL test set" + output_directory_name: + type: string + default: "ppi_fasta" + +arguments: +# - $(inputs.script.path) +# - "mmcif_directory" +# - $(inputs.train_dataset.path) +# - $(inputs.test_dataset.path) +# - "--outdir" +#- $(inputs.output_directory) +- -c +- | + set -ex + mkdir $(inputs.output_directory_name) + touch $(inputs.output_directory_name)/$(inputs.train_dataset.nameroot) + touch $(inputs.output_directory_name)/$(inputs.test_dataset.nameroot) + + +outputs: + ppi_fasta_files: + type: Directory + outputBinding: + glob: $(inputs.output_directory_name) diff --git a/tests/wf/adv_prov/tools/ppi_annotations.py b/tests/wf/adv_prov/tools/ppi_annotations.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/wf/adv_prov/tools/process_sabdab.cwl b/tests/wf/adv_prov/tools/process_sabdab.cwl new file mode 100644 index 000000000..2f627fa15 --- /dev/null +++ b/tests/wf/adv_prov/tools/process_sabdab.cwl @@ -0,0 +1,67 @@ +#!/usr/bin/env cwl-runner + +cwlVersion: v1.2 +class: CommandLineTool + +doc: "Preprocess SAbDab summary file." +intent: [ http://edamontology.org/operation_2409 ] + +hints: + # DockerRequirement: + # dockerPull: amancevice/pandas:1.3.4-slim + SoftwareRequirement: + packages: + python: + version: [ "3.9.7" ] + pandas: + version: [ "1.3.4" ] + +baseCommand: cat # python3 + +arguments: +# - $(inputs.script.path) + - $(inputs.sabdab_summary.path) +# - "-o" +# - $(inputs.results_name) + +stdout: $(inputs.results_name) + +inputs: + script: + type: File + default: + class: File + location: ./process_sabdab_summary.py + sabdab_summary: + type: File + label: Summary file downloaded from SAbDab. + format: iana:text/tab-separated-values + results_name: + type: string + label: Name of output file in which processed results are stored. 
+ default: "SAbDab_protein_antigens_PDB_chains.csv" + +outputs: + processed_summary: + type: File + format: iana:text/csv + outputBinding: + glob: $(inputs.results_name) + +s:author: +- class: s:Person + s:name: "Renske de Wit" + s:identifier: https://orcid.org/0000-0003-0902-0086 +s:license: https://spdx.org/licenses/Apache-2.0 + +s:mainEntity: + class: s:SoftwareApplication + s:license: https://spdx.org/licenses/Apache-2.0 + s:author: + - class: s:Person + s:name: "Katharina Waury" + s:identifier: + +$namespaces: + iana: "https://www.iana.org/assignments/media-types/" + s: "https://schema.org/" diff --git a/tests/wf/adv_prov/tools/process_sabdab_summary.py b/tests/wf/adv_prov/tools/process_sabdab_summary.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/wf/adv_prov/tools/psp19_inputs.cwl b/tests/wf/adv_prov/tools/psp19_inputs.cwl new file mode 100644 index 000000000..0b34196ae --- /dev/null +++ b/tests/wf/adv_prov/tools/psp19_inputs.cwl @@ -0,0 +1,54 @@ +#!/usr/bin/env cwl-runner + +cwlVersion: v1.2 +class: CommandLineTool +# hints: +# DockerRequirement: +# dockerPull: amancevice/pandas:1.3.4-slim +# SoftwareRequirement: +# packages: +# numpy: +# specs: [ https://anaconda.org/conda-forge/numpy ] + # python: + # version: + +baseCommand: bash # python3 + +inputs: + script: + type: File + default: + class: File + location: ./get_psp19_inputs.py + # inputBinding: + # position: 1 + fasta: + type: Directory + format: edam:format_2200 + # inputBinding: + # position: 2 + outdir: + type: string + # inputBinding: + # position: 3 + # prefix: -o + default: "psp19_features" + +arguments: + - -c + - | + set -ex + mkdir $(inputs.outdir) + touch $(inputs.outdir)/$(inputs.fasta.basename) + +outputs: + psp19_features: + type: Directory + outputBinding: + glob: $(inputs.outdir) + +$namespaces: + edam: http://edamontology.org/ + +$schemas: +- https://edamontology.org/EDAM_1.25.owl diff --git a/tests/wf/adv_prov/tools/train_epitope_model.cwl
b/tests/wf/adv_prov/tools/train_epitope_model.cwl new file mode 100644 index 000000000..141180356 --- /dev/null +++ b/tests/wf/adv_prov/tools/train_epitope_model.cwl @@ -0,0 +1,68 @@ +#!/usr/bin/env cwl-runner + +cwlVersion: v1.2 +class: CommandLineTool + +baseCommand: python3 + +doc: "Model training." +intent: [ http://edamontology.org/operation_2423 ] +hints: + SoftwareRequirement: + packages: + python: + version: [ "3.9" ] + tqdm: + specs: [ https://pypi.org/project/tqdm/ ] + version: [ "4.64.0" ] + tensorflow-gpu: + specs: [ https://pypi.org/project/tensorflow-gpu/ ] + version: [ 2.9.1 ] + tensorflow-addons: + specs: [ https://pypi.org/project/tensorflow-addons/ ] + version: [ "0.17.1" ] + numpy: + version: [ "1.21.5" ] + click: + version: [ "8.0.4" ] + commentjson: + specs: [ https://pypi.org/project/commentjson/ ] + version: [ "0.9.0" ] + +arguments: +- $(inputs.script.path) +- $(inputs.config_file.path) +- $(inputs.input_features.path) +- $(inputs.input_labels.path) + +inputs: + script: + type: File + default: + class: File + location: ./emulated_model.py # this is a placeholder script + config_file: + type: File + default: + class: File + location: ../model_example_params.json + doc: "Configuration file used for the model. Here: standard file, but in real workflow it should be generated from previous steps." + input_features: + type: Directory + input_labels: + type: Directory + + +stdout: "training_log.txt" + +outputs: + train_log: + type: stdout + doc: "Output of the model containing predictions and/or performance on the test set." + + + + + + +