start of a more advanced test

common-workflow-language · Dec 15, 2022 · 5466f95 · 5466f95
1 parent e2009ac
commit 5466f95
Show file tree

Hide file tree

Showing 34 changed files with 1,246 additions and 4 deletions.
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -9,6 +9,9 @@ include tests/loop/*
 include tests/tmp1/tmp2/tmp3/.gitkeep
 include tests/tmp4/alpha/*
 include tests/wf/*
+include tests/wf/adv_prov/*
+include tests/wf/adv_prov/data/*
+include tests/wf/adv_prov/tools/*
 include tests/wf/operation/*
 include tests/override/*
 include tests/reloc/*.cwl

diff --git a/cwltool/provenance_profile.py b/cwltool/provenance_profile.py
@@ -404,10 +404,16 @@ def declare_file(self, value: CWLObjectType) -> Tuple[ProvEntity, ProvEntity, st
         # Transfer SCHEMA annotations to provenance
         for s in schema_annotations:
             if "additionalType" in s:
-                additional_type = cast(str, schema_annotations[s]).split(sep="/")[
-                    -1
-                ]  # find better method?
-                file_entity.add_attributes({PROV_TYPE: SCHEMA[additional_type]})
+                atype = schema_annotations[s]
+                if isinstance(atype, str):
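+                    additional_type = atype.split(sep="/")[-1]  # NOTE(review): keeps only the last "/" segment, so the original namespace is dropped before the SCHEMA lookup; find a more robust method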
+                    file_entity.add_attributes({PROV_TYPE: SCHEMA[additional_type]})
+                else:
+                    for a_entry in cast(List[str], atype):
+                        additional_type = a_entry.split(sep="/")[
+                            -1
+                        ]  # find better method?
+                        file_entity.add_attributes({PROV_TYPE: SCHEMA[additional_type]})
             else:
                 file_entity = self._add_nested_annotations(
                     s, schema_annotations[s], file_entity

diff --git a/tests/test_provenance.py b/tests/test_provenance.py
@@ -103,6 +103,27 @@ def test_revsort_label_annotations(tmp_path: Path) -> None:
         )
 
 
+def test_advanced_prov_annotations(tmp_path: Path) -> None:
+    """Pass through of advanced input annotations."""
+    base_path = cwltool(
+        tmp_path,
+        get_data("tests/wf/adv_prov/niaa_wf.cwl"),
+        get_data("tests/wf/adv_prov/niaa_wf_job.yml"),
+    )
+    prov_file = base_path / "metadata" / "provenance" / "primary.cwlprov.nt"
+    arcp_root = find_arcp(base_path)
+    g = Graph()
+    with open(prov_file, "rb") as f:
+        g.parse(file=f, format="nt", publicID=arcp_root)
+    mime_having_objects = list(g.subjects(SCHEMA.encodingFormat))
+    assert len(mime_having_objects) == 8
+    # for obj in mime_having_objects:
+    #     assert (
+    #         cast(Literal, list(g.objects(obj, SCHEMA.encodingFormat))[0]).value
+    #         == "https://www.iana.org/assignments/media-types/text/plain"
+    #     )
+
+
 @needs_docker
 def test_nested_workflow(tmp_path: Path) -> None:
     check_provenance(cwltool(tmp_path, get_data("tests/wf/nested.cwl")), nested=True)

diff --git a/tests/wf/adv_prov/data/pdb_query.json b/tests/wf/adv_prov/data/pdb_query.json
diff --git a/tests/wf/adv_prov/data/prepared_biolip_win_p_testing.csv b/tests/wf/adv_prov/data/prepared_biolip_win_p_testing.csv
diff --git a/tests/wf/adv_prov/data/prepared_biolip_win_p_training.csv b/tests/wf/adv_prov/data/prepared_biolip_win_p_training.csv
diff --git a/tests/wf/adv_prov/data/sabdab_summary_all_20220527.tsv b/tests/wf/adv_prov/data/sabdab_summary_all_20220527.tsv
diff --git a/tests/wf/adv_prov/model_example_params.json b/tests/wf/adv_prov/model_example_params.json
diff --git a/tests/wf/adv_prov/niaa_wf.cwl b/tests/wf/adv_prov/niaa_wf.cwl
@@ -0,0 +1,186 @@
+#!/usr/bin/env cwl-runner
+
+cwlVersion: v1.2
+class: Workflow
+
+intent: [ edam:operation_2423 ]  # Prediction operation
+doc: "This mock workflow calculates input features and labels which are used to train a deep learning model for epitope prediction."
+
+requirements:
+  ScatterFeatureRequirement: {}
+  StepInputExpressionRequirement: {}
+  SubworkflowFeatureRequirement: {}
+
+inputs: 
+  sabdab_summary: 
+    type: File
+    format: iana:text/tab-separated-values
+    doc: "SAbDab Summary metadata about all structures in the database."
+  biodl_train_dataset: 
+    type: File
+    format: iana:text/csv
+    doc: "BioDL training dataset containing PPI interactions"
+  biodl_test_dataset: 
+    type: File
+    doc: "BioDL test dataset with PPI interactions."
+  hhblits_db:
+    type: Directory
+    doc: "Reference database for HHblits"
+  hhblits_db_name: 
+    type: string
+    doc: "Name of hhblits reference database"
+  pdb_search_api_query: 
+    type: File
+    format: iana:application/json
+    doc: "Structured query for PDB API."
+
+outputs: 
+  model_output:
+    type: File
+    outputSource: train_epitope_prediction_model/train_log
+    doc: "Output of the prediction model."
+
+steps:
+  run_pdb_query:
+    in:
+      pdb_search_query: pdb_search_api_query
+    out:
+      [ processed_response ]
+    run: ./tools/pdb_query.cwl
+    doc: |
+      Use PDB search API to run a query on the Protein Data Bank. Returns .txt file with comma-separated PDB IDs which satisfy the query requirements.
+      See https://search.rcsb.org/index.html#search-api for a tutorial.
+
+  download_pdb_files:
+    in: 
+      input_file: run_pdb_query/processed_response 
+      mmcif_format: { default: True }
+      pdb_format: { default: True }
+    out:
+      [ pdb_files ]
+    run: ./tools/pdb_batch_download.cwl
+
+  decompress_pdb_files:
+    in:
+      pdb_archives: download_pdb_files/pdb_files
+    out: [ cifs, pdbs ]
+    run: ./tools/decompress.cwl
+    doc: "Decompress files using gzip"
+
+  generate_dssp_labels:
+    in:
+      pdb_files: decompress_pdb_files/pdbs # change this later
+      rsa_cutoff: { default :  0.06 }
+    out: [ dssp_output_files ]
+    run: ./tools/dssp.cwl
+    doc: "Use DSSP to extract secondary structure and solvent accessibility from PDB files."
+
+  generate_ppi_labels:
+    in:
+      mmcif_files: decompress_pdb_files/cifs
+      train_dataset: biodl_train_dataset
+      test_dataset: biodl_test_dataset
+    out: [ ppi_fasta_files ]
+    run: ./tools/ppi_annotations.cwl
+    doc: "Extract ppi annotations from BioDL. This step is partly emulated."
+
+  preprocess_sabdab_data:
+    doc: "Extract antigen chains from SAbDab summary file."
+    in:
+      sabdab_summary: sabdab_summary
+    out: [ processed_summary ]
+    run: ./tools/process_sabdab.cwl
+
+  generate_epitope_labels:
+    in: 
+      mmcif_files: decompress_pdb_files/cifs
+      sabdab_processed: preprocess_sabdab_data/processed_summary
+    out: [ epitope_fasta_dir ]
+    run: ./tools/epitope_annotations.cwl
+    doc: "Extract epitope annotations from PDB files."
+
+  combine_labels:
+    doc: "Combine labels into 1 file per protein sequence."
+    run: ./tools/combine_labels.cwl
+    in:
+      epitope_directory: generate_epitope_labels/epitope_fasta_dir
+      ppi_directory: generate_ppi_labels/ppi_fasta_files
+      dssp_directory: generate_dssp_labels/dssp_output_files
+    out: [ labels_combined ]
+
+  generate_pc7:
+    doc: Calculate PC7 features for each residue in each protein sequence.
+    run: ./tools/pc7_inputs.cwl # TODO: adapt the tool so it takes a directory of FASTA files as input
+    in: 
+      fasta: generate_ppi_labels/ppi_fasta_files 
+    out: [ pc7_features ]  
+
+  generate_psp19:
+    label: Calculate PSP19 features for each residue in each protein sequence.
+    run: ./tools/psp19_inputs.cwl
+    in:
+      fasta: generate_ppi_labels/ppi_fasta_files
+    out: [ psp19_features ]
+
+  generate_hhm:
+    in:
+      query_sequences: 
+        source: generate_ppi_labels/ppi_fasta_files # type Directory
+        valueFrom: $(self.listing) # here type Directory is converted to File array
+      hhblits_db: hhblits_db
+      hhblits_db_name: hhblits_db_name
+      hhblits_n_iterations: { default: 1 }
+    out: [ hhm_file_array ]
+    run:
+      class: Workflow # subworkflow used as a workaround: generate_ppi_labels/ppi_fasta_files is a Directory, while run_hhblits takes a File
+      inputs:
+        query_sequences: File[]
+        hhblits_db: Directory
+        hhblits_db_name: string
+        hhblits_n_iterations: int
+      outputs:
+        hhm_file_array:
+          type: File[]
+          outputSource: run_hhblits/hhm
+      steps:
+        run_hhblits:
+          in: 
+            protein_query_sequence: query_sequences
+            database: hhblits_db
+            database_name: hhblits_db_name
+            n_iterations: hhblits_n_iterations
+          out: [ hhm ]
+          scatter: protein_query_sequence
+          run: ./tools/hhm_inputs_scatter.cwl
+  combine_features:
+    in: 
+      input_sequences: generate_ppi_labels/ppi_fasta_files
+      pc7_features: generate_pc7/pc7_features
+      psp19_features: generate_psp19/psp19_features
+      hhm_features: generate_hhm/hhm_file_array # file array, combine_features.cwl converts it to directory
+    out: [ combined_features ]
+    run: ./tools/combine_features.cwl  
+
+  train_epitope_prediction_model: # This step incorporates both training and prediction, not sure if this is the case in the real workflow.
+    in: # in the real workflow, the configuration file would be generated as part of the workflow as well
+      input_features: combine_features/combined_features
+      input_labels: combine_labels/labels_combined
+    out: [ train_log ] 
+    run: ./tools/train_epitope_model.cwl
+    doc: "Predict epitope residues using a multi-task learning approach. This step is not real yet."  
+
+$namespaces:
+  iana: "https://www.iana.org/assignments/media-types/"
+  s: "https://schema.org/"
+  edam: "http://edamontology.org/"
+  cwlprov: "https://w3id.org/cwl/prov#"
+
+$schemas:
+- https://schema.org/version/latest/schemaorg-current-https.rdf
+- https://edamontology.org/EDAM_1.25.owl
+
+s:author:
+- s:name: "Renske de Wit"
+  s:identifier: https://orcid.org/0000-0003-0902-0086
+- s:name: "Katharina Waury"
+s:license: https://spdx.org/licenses/Apache-2.0
diff --git a/tests/wf/adv_prov/niaa_wf_job.yml b/tests/wf/adv_prov/niaa_wf_job.yml
@@ -0,0 +1,78 @@
+cwlprov:prov:
+  sabdab_search:
+    s:additionalType: s:SearchAction
+    s:query: "All structures"
+    s:endTime: 2022-05-27
+    s:object:
+      s:name: "Structural Antibody Database"
+      s:citation:
+        s:identifier: https://doi.org/10.1093/nar/gkab1050
+    s:result: sabdab_summary
+    s:description: "Search Action for metadata on antibody-antigen complexes in SAbDab"
+
+
+pdb_search_api_query:
+  class: File
+  location: ./data/pdb_query.json
+  format: iana:application/json
+  s:description: "Input query for PDB search API."
+  s:additionalType:
+  - edam:data_3786 # Query script
+
+sabdab_summary:
+  class: File
+  path: ./data/sabdab_summary_all_20220527.tsv
+  format: iana:text/tab-separated-values
+  s:description: "Summary file downloaded from SAbDAb database, containing metadata for all structures."
+  s:additionalType:
+  - edam:data_2080 # database search results
+  - s:Dataset
+
+
+biodl_train_dataset:
+  class: File
+  path: data/prepared_biolip_win_p_training.csv
+  #location: https://www.ibi.vu.nl/downloads/PIPENN/PIPENN/BioDL-Datasets/prepared_biolip_win_p_training.csv
+  format: iana:text/csv
+  s:description: "BioDL training set containing PPI annotations for protein sequences (UniProt IDs)"
+  s:name: "BioDL training dataset"
+  s:citation:
+    s:identifier: https://doi.org/10.1093/bioinformatics/btac071
+  s:additionalType:
+  - s:Dataset
+  - edam:data_1277 # protein features
+
+biodl_test_dataset:
+  class: File
+  path: data/prepared_biolip_win_p_testing.csv
+  #location: https://www.ibi.vu.nl/downloads/PIPENN/PIPENN/BioDL-Datasets/prepared_biolip_win_p_testing.csv
+  s:description: "BioDL test set containing PPI annotations for protein sequences (UniProt IDs)."
+  s:name: "BioDL test dataset"
+  s:citation:
+    s:identifier: https://doi.org/10.1093/bioinformatics/btac071
+  s:additionalType:
+  - s:Dataset
+  - edam:data_1277 # protein features
+
+hhblits_db: 
+  class: Directory
+  location: ../hhblits/databases
+  s:citation:
+    s:identifier: https://doi.org/10.1038/nmeth.1818
+  s:name: "pdb70"
+  s:description: "Directory containing HHBlits reference database."
+  s:additionalType:
+  - s:Dataset
+  - edam:data_0955 # data index
+
+hhblits_db_name: pdb70
+hhblits_n_iterations: 1
+
+s:description: "Demonstration run of epitope prediction workflow. Some steps are emulated, so the results of the workflow are not yet biologically meaningful."
+
+$namespaces:
+  iana: "https://www.iana.org/assignments/media-types/"
+  s: "https://schema.org/"
+  edam: "http://edamontology.org/"
+  cwlprov: "https://w3id.org/cwl/prov#"
+