Skip to content

Commit

Permalink
start of a more advanced test
Browse files Browse the repository at this point in the history
  • Loading branch information
mr-c committed Dec 15, 2022
1 parent e2009ac commit 09193f7
Show file tree
Hide file tree
Showing 33 changed files with 1,243 additions and 4 deletions.
14 changes: 10 additions & 4 deletions cwltool/provenance_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -404,10 +404,16 @@ def declare_file(self, value: CWLObjectType) -> Tuple[ProvEntity, ProvEntity, st
# Transfer SCHEMA annotations to provenance
for s in schema_annotations:
if "additionalType" in s:
additional_type = cast(str, schema_annotations[s]).split(sep="/")[
-1
] # find better method?
file_entity.add_attributes({PROV_TYPE: SCHEMA[additional_type]})
atype = schema_annotations[s]
if isinstance(atype, str):
additional_type = atype.split(sep="/")[-1] # find better method?
file_entity.add_attributes({PROV_TYPE: SCHEMA[additional_type]})
else:
for a_entry in cast(List[str], atype):
additional_type = a_entry.split(sep="/")[
-1
] # find better method?
file_entity.add_attributes({PROV_TYPE: SCHEMA[additional_type]})
else:
file_entity = self._add_nested_annotations(
s, schema_annotations[s], file_entity
Expand Down
21 changes: 21 additions & 0 deletions tests/test_provenance.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,27 @@ def test_revsort_label_annotations(tmp_path: Path) -> None:
)


def test_advanced_prov_annotations(tmp_path: Path) -> None:
    """Check that advanced input annotations are passed through to provenance."""
    base_path = cwltool(
        tmp_path,
        get_data("tests/wf/adv_prov/niaa_wf.cwl"),
        get_data("tests/wf/adv_prov/niaa_wf_job.yml"),
    )
    arcp_root = find_arcp(base_path)
    prov_file = base_path / "metadata" / "provenance" / "primary.cwlprov.nt"
    graph = Graph()
    with prov_file.open("rb") as handle:
        graph.parse(file=handle, format="nt", publicID=arcp_root)
    objects_with_mime = list(graph.subjects(SCHEMA.encodingFormat))
    assert len(objects_with_mime) == 8
    # NOTE(review): disabled pending agreement on the expected encodingFormat
    # values — the workflow declares several distinct IANA media types, so a
    # single expected value would not hold for every object.
    # for obj in objects_with_mime:
    #     assert (
    #         cast(Literal, list(graph.objects(obj, SCHEMA.encodingFormat))[0]).value
    #         == "https://www.iana.org/assignments/media-types/text/plain"
    #     )


@needs_docker
def test_nested_workflow(tmp_path: Path) -> None:
    """Provenance capture for a workflow that contains a nested sub-workflow."""
    result_dir = cwltool(tmp_path, get_data("tests/wf/nested.cwl"))
    check_provenance(result_dir, nested=True)
Expand Down
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
186 changes: 186 additions & 0 deletions tests/wf/adv_prov/niaa_wf.cwl
Original file line number Diff line number Diff line change
@@ -0,0 +1,186 @@
#!/usr/bin/env cwl-runner

cwlVersion: v1.2
class: Workflow

intent: [ edam:operation_2423 ] # Prediction operation
doc: "This mock workflow calculates input features and labels which are used to train a deep learning model for epitope prediction."

requirements:
ScatterFeatureRequirement: {}
StepInputExpressionRequirement: {}
SubworkflowFeatureRequirement: {}

inputs:
sabdab_summary:
type: File
format: iana:text/tab-separated-values
doc: "SAbDAb Summary metadata about all structures in the database."
biodl_train_dataset:
type: File
format: iana:text/csv
doc: "BioDL training dataset containing PPI interactions"
biodl_test_dataset:
type: File
doc: "BioDL test dataset with PPI interactions."
hhblits_db:
type: Directory
doc: "Reference database for HHblits"
hhblits_db_name:
type: string
doc: "Name of hhblits reference database"
pdb_search_api_query:
type: File
format: iana:application/json
doc: "Structured query for PDB API."

outputs:
model_output:
type: File
outputSource: train_epitope_prediction_model/train_log
doc: "Output of the prediction model."

steps:
run_pdb_query:
in:
pdb_search_query: pdb_search_api_query
out:
[ processed_response ]
run: ./tools/pdb_query.cwl
doc: |
Use PDB search API to run a query on the Protein Data Bank. Returns .txt file with comma-separated PDB IDs which satisfy the query requirements.
See https://search.rcsb.org/index.html#search-api for a tutorial.

download_pdb_files:
in:
input_file: run_pdb_query/processed_response
mmcif_format: { default: True }
pdb_format: { default: True }
out:
[ pdb_files ]
run: ./tools/pdb_batch_download.cwl

decompress_pdb_files:
in:
pdb_archives: download_pdb_files/pdb_files
out: [ cifs, pdbs ]
run: ./tools/decompress.cwl
doc: "Decompress files using gzip"

generate_dssp_labels:
in:
pdb_files: decompress_pdb_files/pdbs # change this later
rsa_cutoff: { default : 0.06 }
out: [ dssp_output_files ]
run: ./tools/dssp.cwl
doc: "Use DSSP to extract secondary structure and solvent accessibility from PDB files."

generate_ppi_labels:
in:
mmcif_files: decompress_pdb_files/cifs
train_dataset: biodl_train_dataset
test_dataset: biodl_test_dataset
out: [ ppi_fasta_files ]
run: ./tools/ppi_annotations.cwl
doc: "Extract ppi annotations from BioDL. This step is partly emulated."

preprocess_sabdab_data:
doc: "Extract antigen chains from SAbDab summary file."
in:
sabdab_summary: sabdab_summary
out: [ processed_summary ]
run: ./tools/process_sabdab.cwl

generate_epitope_labels:
in:
mmcif_files: decompress_pdb_files/cifs
sabdab_processed: preprocess_sabdab_data/processed_summary
out: [ epitope_fasta_dir ]
run: ./tools/epitope_annotations.cwl
doc: "Extract epitope annotations from PDB files."

combine_labels:
doc: "Combine labels into 1 file per protein sequence."
run: ./tools/combine_labels.cwl
in:
epitope_directory: generate_epitope_labels/epitope_fasta_dir
ppi_directory: generate_ppi_labels/ppi_fasta_files
dssp_directory: generate_dssp_labels/dssp_output_files
out: [ labels_combined ]

generate_pc7:
doc: Calculate PC7 features for each residue in each protein sequence.
run: ./tools/pc7_inputs.cwl # to do: adapt tool so it takes directory of fasta files as input
in:
fasta: generate_ppi_labels/ppi_fasta_files
out: [ pc7_features ]

generate_psp19:
label: Calculate PSP19 features for each residue in each protein sequence.
run: ./tools/psp19_inputs.cwl
in:
fasta: generate_ppi_labels/ppi_fasta_files
out: [ psp19_features ]

generate_hhm:
in:
query_sequences:
source: generate_ppi_labels/ppi_fasta_files # type Directory
valueFrom: $(self.listing) # here type Directory is converted to File array
hhblits_db: hhblits_db
hhblits_db_name: hhblits_db_name
hhblits_n_iterations: { default: 1 }
out: [ hhm_file_array ]
run:
class: Workflow # this is a subworkflow as a workaround because generate_ppi_labels/ppi_fasta_files is Directory while run_hhblits takes File
inputs:
query_sequences: File[]
hhblits_db: Directory
hhblits_db_name: string
hhblits_n_iterations: int
outputs:
hhm_file_array:
type: File[]
outputSource: run_hhblits/hhm
steps:
run_hhblits:
in:
protein_query_sequence: query_sequences
database: hhblits_db
database_name: hhblits_db_name
n_iterations: hhblits_n_iterations
out: [ hhm ]
scatter: protein_query_sequence
run: ./tools/hhm_inputs_scatter.cwl
combine_features:
in:
input_sequences: generate_ppi_labels/ppi_fasta_files
pc7_features: generate_pc7/pc7_features
psp19_features: generate_psp19/psp19_features
hhm_features: generate_hhm/hhm_file_array # file array, combine_features.cwl converts it to directory
out: [ combined_features ]
run: ./tools/combine_features.cwl

train_epitope_prediction_model: # This step incorporates both training and prediction, not sure if this is the case in the real workflow.
in: # in the real workflow, the configuration file would be generated as part of the workflow as well
input_features: combine_features/combined_features
input_labels: combine_labels/labels_combined
out: [ train_log ]
run: ./tools/train_epitope_model.cwl
doc: "Predict epitope residues using a multi-task learning approach. This step is not real yet."

$namespaces:
iana: "https://www.iana.org/assignments/media-types/"
s: "https://schema.org/"
edam: "http://edamontology.org/"
cwlprov: "https://w3id.org/cwl/prov#"

$schemas:
- https://schema.org/version/latest/schemaorg-current-https.rdf
- https://edamontology.org/EDAM_1.25.owl

s:author:
- s:name: "Renske de Wit"
s:identifier: https://orcid.org/0000-0003-0902-0086
- s:name: "Katharina Waury"
s:license: https://spdx.org/licenses/Apache-2.0
78 changes: 78 additions & 0 deletions tests/wf/adv_prov/niaa_wf_job.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
cwlprov:prov:
sabdab_search:
s:additionalType: s:SearchAction
s:query: "All structures"
s:endTime: 2022-05-27
s:object:
s:name: "Structural Antibody Database"
s:citation:
s:identifier: https://doi.org/10.1093/nar/gkab1050
s:result: sabdab_summary
s:description: "Search Action for metadata on antibody-antigen complexes in SAbDab"


pdb_search_api_query:
class: File
location: ./data/pdb_query.json
format: iana:application/json
s:description: "Input query for PDB search API."
s:additionalType:
- edam:data_3786 # Query script

sabdab_summary:
class: File
path: ./data/sabdab_summary_all_20220527.tsv
format: iana:text/tab-separated-values
s:description: "Summary file downloaded from SAbDAb database, containing metadata for all structures."
s:additionalType:
- edam:data_2080 # database search results
- s:Dataset


biodl_train_dataset:
class: File
path: data/prepared_biolip_win_p_training.csv
#location: https://www.ibi.vu.nl/downloads/PIPENN/PIPENN/BioDL-Datasets/prepared_biolip_win_p_training.csv
format: iana:text/csv
s:description: "BioDL training set containing PPI annotations for protein sequences (UniProt IDs)"
s:name: "BioDL training dataset"
s:citation:
s:identifier: https://doi.org/10.1093/bioinformatics/btac071
s:additionalType:
- s:Dataset
- edam:data_1277 # protein features

biodl_test_dataset:
class: File
path: data/prepared_biolip_win_p_testing.csv
#location: https://www.ibi.vu.nl/downloads/PIPENN/PIPENN/BioDL-Datasets/prepared_biolip_win_p_testing.csv
s:description: "BioDL test set containing PPI annotations for protein sequences (UniProt IDs)."
s:name: "BioDL test dataset"
s:citation:
s:identifier: https://doi.org/10.1093/bioinformatics/btac071
s:additionalType:
- s:Dataset
- edam:data_1277 # protein features

hhblits_db:
class: Directory
location: ../hhblits/databases
s:citation:
s:identifier: https://doi.org/10.1038/nmeth.1818
s:name: "pdb70"
s:description: "Directory containing HHBlits reference database."
s:additionalType:
- s:Dataset
- edam:data_0955 # data index

hhblits_db_name: pdb70
hhblits_n_iterations: 1

s:description: "Demonstration run of epitope prediction workflow. Some steps are emulated, so the results of the workflow are not yet biologically meaningful."

$namespaces:
iana: "https://www.iana.org/assignments/media-types/"
s: "https://schema.org/"
edam: "http://edamontology.org/"
cwlprov: "https://w3id.org/cwl/prov#"

Loading

0 comments on commit 09193f7

Please sign in to comment.