From b954975070555daf4697cac80898f38213d6016e Mon Sep 17 00:00:00 2001 From: RenskeW <64646852+RenskeW@users.noreply.github.com> Date: Tue, 7 Jun 2022 17:50:16 +0200 Subject: [PATCH 01/14] Propagate 'label' to primary.cwlprov file --- cwltool/provenance_profile.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cwltool/provenance_profile.py b/cwltool/provenance_profile.py index 025f5581d..58511e533 100644 --- a/cwltool/provenance_profile.py +++ b/cwltool/provenance_profile.py @@ -350,6 +350,10 @@ def declare_file(self, value: CWLObjectType) -> Tuple[ProvEntity, ProvEntity, st file_entity.add_attributes({CWLPROV["nameext"]: value["nameext"]}) self.document.specializationOf(file_entity, entity) + # Transfer input data annotations to provenance: + if "label" in value: + file_entity.add_attributes({PROV_LABEL: value["label"]}) + # Check for secondaries for sec in cast( MutableSequence[CWLObjectType], value.get("secondaryFiles", []) From 93f091a3582e7cf3863d839e137a0698b58c50ea Mon Sep 17 00:00:00 2001 From: RenskeW <64646852+RenskeW@users.noreply.github.com> Date: Thu, 9 Jun 2022 09:32:58 +0200 Subject: [PATCH 02/14] Run make format --- cwltool/provenance_profile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cwltool/provenance_profile.py b/cwltool/provenance_profile.py index 58511e533..f82feb63d 100644 --- a/cwltool/provenance_profile.py +++ b/cwltool/provenance_profile.py @@ -352,7 +352,7 @@ def declare_file(self, value: CWLObjectType) -> Tuple[ProvEntity, ProvEntity, st # Transfer input data annotations to provenance: if "label" in value: - file_entity.add_attributes({PROV_LABEL: value["label"]}) + file_entity.add_attributes({PROV_LABEL: value["label"]}) # Check for secondaries for sec in cast( From 99fc1967cd0941a8694ea9cd3754a467ad818bad Mon Sep 17 00:00:00 2001 From: RenskeW <64646852+RenskeW@users.noreply.github.com> Date: Mon, 20 Jun 2022 12:44:31 +0200 Subject: [PATCH 03/14] Propagate schema.org annotations to provenance --- cwltool/provenance_profile.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/cwltool/provenance_profile.py b/cwltool/provenance_profile.py index f82feb63d..554016c90 100644 --- a/cwltool/provenance_profile.py +++ b/cwltool/provenance_profile.py @@ -350,9 +350,23 @@ def declare_file(self, value: CWLObjectType) -> Tuple[ProvEntity, ProvEntity, st file_entity.add_attributes({CWLPROV["nameext"]: value["nameext"]}) self.document.specializationOf(file_entity, entity) + def recursive_function(dataset, e: ProvEntity) -> ProvEntity: + for annotation in dataset: + if isinstance(dataset[annotation], (str, bool, int, float)): # check if these are all allowed types + e.add_attributes({annotation: dataset[annotation]}) + else: + nested_id = uuid.uuid4().urn + # e.add_attributes({annotation: nested_id}) + nested_entity = self.document.entity(nested_id) + e.add_attributes({annotation: nested_entity.identifier}) + nested_entity = recursive_function(dataset[annotation], nested_entity) + return e + # Transfer input data annotations to provenance: - if "label" in value: - file_entity.add_attributes({PROV_LABEL: value["label"]}) + if SCHEMA["Dataset"].uri in value: + entity.add_attributes( {PROV_TYPE: SCHEMA["Dataset"]}) + entity = recursive_function(value[SCHEMA["Dataset"].uri], entity) + # Check for secondaries for sec in cast( From c6e738566b51248ae24dbd1bdbff716057672217 Mon Sep 17 00:00:00 2001 From: RenskeW <64646852+RenskeW@users.noreply.github.com> Date: Wed, 22 Jun 2022 15:41:58 +0200 Subject: [PATCH 04/14] 
Added format annotations to provenance + started Directory annotations --- cwltool/provenance_profile.py | 47 ++++++++++++++++++++++++++++++++--- 1 file changed, 43 insertions(+), 4 deletions(-) diff --git a/cwltool/provenance_profile.py b/cwltool/provenance_profile.py index 554016c90..a10f69df4 100644 --- a/cwltool/provenance_profile.py +++ b/cwltool/provenance_profile.py @@ -296,6 +296,25 @@ def record_process_end( self.generate_output_prov(outputs, process_run_id, process_name) self.document.wasEndedBy(process_run_id, None, self.workflow_run_uri, when) + + + # def _add_nested_annotations(dataset, e: ProvEntity) -> ProvEntity: + # for annotation in dataset: + # if isinstance(dataset[annotation], (str, bool, int, float)): # check if these are all allowed types + # e.add_attributes({annotation: dataset[annotation]}) + # else: + # nested_id = uuid.uuid4().urn + # # e.add_attributes({annotation: nested_id}) + # nested_entity = self.document.entity(nested_id) + # e.add_attributes({annotation: nested_entity.identifier}) + # nested_entity = _add_nested_annotations(dataset[annotation], nested_entity) + # return e + + # def _propagate_input_annotations(entity): + # entity.add_attributes( {PROV_TYPE: SCHEMA["Dataset"]}) + # entity = _add_nested_annotations(value[SCHEMA["Dataset"].uri], entity) + # return entity + def declare_file(self, value: CWLObjectType) -> Tuple[ProvEntity, ProvEntity, str]: if value["class"] != "File": raise ValueError("Must have class:File: %s" % value) @@ -350,7 +369,9 @@ def declare_file(self, value: CWLObjectType) -> Tuple[ProvEntity, ProvEntity, st file_entity.add_attributes({CWLPROV["nameext"]: value["nameext"]}) self.document.specializationOf(file_entity, entity) - def recursive_function(dataset, e: ProvEntity) -> ProvEntity: + + + def _add_nested_annotations(dataset, e: ProvEntity) -> ProvEntity: for annotation in dataset: if isinstance(dataset[annotation], (str, bool, int, float)): # check if these are all allowed types e.add_attributes({annotation: dataset[annotation]}) @@ -359,14 +380,17 @@ def recursive_function(dataset, e: ProvEntity) -> ProvEntity: # e.add_attributes({annotation: nested_id}) nested_entity = self.document.entity(nested_id) e.add_attributes({annotation: nested_entity.identifier}) - nested_entity = recursive_function(dataset[annotation], nested_entity) + nested_entity = _add_nested_annotations(dataset[annotation], nested_entity) return e # Transfer input data annotations to provenance: - if SCHEMA["Dataset"].uri in value: + if SCHEMA["Dataset"].uri in value: # TODO: modify so both http:/ and https:/ are recognized entity.add_attributes( {PROV_TYPE: SCHEMA["Dataset"]}) - entity = recursive_function(value[SCHEMA["Dataset"].uri], entity) + entity = _add_nested_annotations(value[SCHEMA["Dataset"].uri], entity) + # Transfer format annotations to provenance: + if "format" in value: + entity.add_attributes({SCHEMA["encodingFormat"]: value["format"]}) # Check for secondaries for sec in cast( @@ -413,6 +437,7 @@ def declare_directory(self, value: CWLObjectType) -> ProvEntity: (PROV_TYPE, RO["Folder"]), ], ) + # ORE description of ro:Folder, saved separately coll_b = dir_bundle.entity( dir_id, @@ -473,6 +498,20 @@ def declare_directory(self, value: CWLObjectType) -> ProvEntity: coll.add_attributes(coll_attribs) coll_b.add_attributes(coll_b_attribs) + # Propagate input data annotations + if SCHEMA["Dataset"].uri in value: + # coll_annotations = [ (PROV_TYPE, SCHEMA["Dataset"]) ] + coll.add_attributes([ (PROV_TYPE, SCHEMA["Dataset"]) ]) + + dataset = 
value[SCHEMA["Dataset"].uri] + + for annotation in dataset: + if isinstance(dataset[annotation], (str, bool, int, float)): # check if these are all allowed types + coll.add_attributes({annotation: dataset[annotation]}) + + if "format" in value: + coll.add_attributes({SCHEMA["encodingFormat"]: value["format"]}) + # Also Save ORE Folder as annotation metadata ore_doc = ProvDocument() ore_doc.add_namespace(ORE) From 7a509b66d6fdef729cd06062f232dde8d28b8c92 Mon Sep 17 00:00:00 2001 From: RenskeW <64646852+RenskeW@users.noreply.github.com> Date: Mon, 27 Jun 2022 07:32:05 +0200 Subject: [PATCH 05/14] update provenance_profile --- cwltool/provenance_profile.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/cwltool/provenance_profile.py b/cwltool/provenance_profile.py index a10f69df4..712974417 100644 --- a/cwltool/provenance_profile.py +++ b/cwltool/provenance_profile.py @@ -508,9 +508,7 @@ def declare_directory(self, value: CWLObjectType) -> ProvEntity: for annotation in dataset: if isinstance(dataset[annotation], (str, bool, int, float)): # check if these are all allowed types coll.add_attributes({annotation: dataset[annotation]}) - - if "format" in value: - coll.add_attributes({SCHEMA["encodingFormat"]: value["format"]}) + # Also Save ORE Folder as annotation metadata ore_doc = ProvDocument() From 79f8b74ef3918c17556f7e68d31a0d061b210fe7 Mon Sep 17 00:00:00 2001 From: RenskeW <64646852+RenskeW@users.noreply.github.com> Date: Sun, 3 Jul 2022 22:53:08 +0200 Subject: [PATCH 06/14] added support for file arrays + https://schema.org --- cwltool/provenance_profile.py | 62 +++++++++++++++-------------------- 1 file changed, 27 insertions(+), 35 deletions(-) diff --git a/cwltool/provenance_profile.py b/cwltool/provenance_profile.py index 712974417..d5098c40a 100644 --- a/cwltool/provenance_profile.py +++ b/cwltool/provenance_profile.py @@ -76,6 +76,7 @@ def copy_job_order( return customised_job + class ProvenanceProfile: """ Provenance profile. 
@@ -296,24 +297,23 @@ def record_process_end( self.generate_output_prov(outputs, process_run_id, process_name) self.document.wasEndedBy(process_run_id, None, self.workflow_run_uri, when) - - - # def _add_nested_annotations(dataset, e: ProvEntity) -> ProvEntity: - # for annotation in dataset: - # if isinstance(dataset[annotation], (str, bool, int, float)): # check if these are all allowed types - # e.add_attributes({annotation: dataset[annotation]}) - # else: - # nested_id = uuid.uuid4().urn - # # e.add_attributes({annotation: nested_id}) - # nested_entity = self.document.entity(nested_id) - # e.add_attributes({annotation: nested_entity.identifier}) - # nested_entity = _add_nested_annotations(dataset[annotation], nested_entity) - # return e - - # def _propagate_input_annotations(entity): - # entity.add_attributes( {PROV_TYPE: SCHEMA["Dataset"]}) - # entity = _add_nested_annotations(value[SCHEMA["Dataset"].uri], entity) - # return entity + def _add_nested_annotations(self, annotation_key, annotation_value, e: ProvEntity) -> ProvEntity: + """Propagate input data annotations to provenance.""" + # Change https:// into http:// first + schema2_uri = "https://schema.org/" + if schema2_uri in annotation_key: + annotation_key = SCHEMA[annotation_key.replace(schema2_uri, '')].uri + + if not isinstance(annotation_value, (MutableSequence, MutableMapping)): + e.add_attributes({annotation_key: str(annotation_value)}) + else: + nested_id = uuid.uuid4().urn + nested_entity = self.document.entity(nested_id) + e.add_attributes({annotation_key: nested_entity.identifier}) + for nested_key in annotation_value: + nested_value = annotation_value[nested_key] + nested_entity = self._add_nested_annotations(nested_key, nested_value, nested_entity) + return e def declare_file(self, value: CWLObjectType) -> Tuple[ProvEntity, ProvEntity, str]: if value["class"] != "File": @@ -369,24 +369,16 @@ def declare_file(self, value: CWLObjectType) -> Tuple[ProvEntity, ProvEntity, st file_entity.add_attributes({CWLPROV["nameext"]: value["nameext"]}) self.document.specializationOf(file_entity, entity) - + # Identify all schema annotations + schema_annotations = dict([(v, value[v]) for v in value.keys() if 'schema.org' in v]) - def _add_nested_annotations(dataset, e: ProvEntity) -> ProvEntity: - for annotation in dataset: - if isinstance(dataset[annotation], (str, bool, int, float)): # check if these are all allowed types - e.add_attributes({annotation: dataset[annotation]}) - else: - nested_id = uuid.uuid4().urn - # e.add_attributes({annotation: nested_id}) - nested_entity = self.document.entity(nested_id) - e.add_attributes({annotation: nested_entity.identifier}) - nested_entity = _add_nested_annotations(dataset[annotation], nested_entity) - return e - - # Transfer input data annotations to provenance: - if SCHEMA["Dataset"].uri in value: # TODO: modify so both http:/ and https:/ are recognized - entity.add_attributes( {PROV_TYPE: SCHEMA["Dataset"]}) - entity = _add_nested_annotations(value[SCHEMA["Dataset"].uri], entity) + # Transfer SCHEMA annotations to provenance + for s in schema_annotations: + if "additionalType" in s: + additional_type = schema_annotations[s].split(sep='/')[-1] # find better method? 
+ entity.add_attributes( {PROV_TYPE: SCHEMA[additional_type]}) + else: + entity = self._add_nested_annotations(s, schema_annotations[s], entity) # Transfer format annotations to provenance: if "format" in value: From bf34ca6b8b6956c4f2f1116f7949ee765c5c2421 Mon Sep 17 00:00:00 2001 From: RenskeW <64646852+RenskeW@users.noreply.github.com> Date: Mon, 4 Jul 2022 10:32:22 +0200 Subject: [PATCH 07/14] Directory annotations propagated to provenance --- cwltool/provenance_profile.py | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/cwltool/provenance_profile.py b/cwltool/provenance_profile.py index d5098c40a..a17b3c919 100644 --- a/cwltool/provenance_profile.py +++ b/cwltool/provenance_profile.py @@ -311,7 +311,7 @@ def _add_nested_annotations(self, annotation_key, annotation_value, e: ProvEntit nested_entity = self.document.entity(nested_id) e.add_attributes({annotation_key: nested_entity.identifier}) for nested_key in annotation_value: - nested_value = annotation_value[nested_key] + nested_value = annotation_value[nested_key] nested_entity = self._add_nested_annotations(nested_key, nested_value, nested_entity) return e @@ -377,7 +377,7 @@ def declare_file(self, value: CWLObjectType) -> Tuple[ProvEntity, ProvEntity, st if "additionalType" in s: additional_type = schema_annotations[s].split(sep='/')[-1] # find better method? entity.add_attributes( {PROV_TYPE: SCHEMA[additional_type]}) - else: + else: # add support for CommentedSeq entity = self._add_nested_annotations(s, schema_annotations[s], entity) # Transfer format annotations to provenance: @@ -489,18 +489,17 @@ def declare_directory(self, value: CWLObjectType) -> ProvEntity: coll.add_attributes(coll_attribs) coll_b.add_attributes(coll_b_attribs) + + # Identify all schema annotations + schema_annotations = dict([(v, value[v]) for v in value.keys() if 'schema.org' in v]) - # Propagate input data annotations - if SCHEMA["Dataset"].uri in value: - # coll_annotations = [ (PROV_TYPE, SCHEMA["Dataset"]) ] - coll.add_attributes([ (PROV_TYPE, SCHEMA["Dataset"]) ]) - - dataset = value[SCHEMA["Dataset"].uri] - - for annotation in dataset: - if isinstance(dataset[annotation], (str, bool, int, float)): # check if these are all allowed types - coll.add_attributes({annotation: dataset[annotation]}) - + # Transfer SCHEMA annotations to provenance + for s in schema_annotations: + if "additionalType" in s: + additional_type = schema_annotations[s].split(sep='/')[-1] # find better method? 
+ coll.add_attributes( {PROV_TYPE: SCHEMA[additional_type]}) + elif "hasPart" not in s: + coll = self._add_nested_annotations(s, schema_annotations[s], coll) # Also Save ORE Folder as annotation metadata ore_doc = ProvDocument() From 159895991b5e5fe51a4f98f72bdde2678dee5d1b Mon Sep 17 00:00:00 2001 From: RenskeW <64646852+RenskeW@users.noreply.github.com> Date: Tue, 5 Jul 2022 10:10:09 +0200 Subject: [PATCH 08/14] Added support for CommentedSeq --- cwltool/provenance_profile.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/cwltool/provenance_profile.py b/cwltool/provenance_profile.py index a17b3c919..70edb182d 100644 --- a/cwltool/provenance_profile.py +++ b/cwltool/provenance_profile.py @@ -248,6 +248,18 @@ def evaluate( self.prospective_prov(job) customised_job = copy_job_order(job, job_order_object) self.used_artefacts(customised_job, self.workflow_run_uri) + # if CWLPROV['prov'].uri in job_order_object: # maybe move this to another place + # metadata = job_order_object[CWLPROV['prov'].uri] # change uri to CWLPROV['prov'].uri + # for item in metadata: + # # make a new entity with id + # # give it type additionalType value + # # add nested annotations + # # how much of this can we reuse from _add_nested_annotations? + # # how do we identify the correct file to write to? self.workflow_run_uri? + # # + # pass + + def record_process_start( self, process: Process, job: JobsType, process_run_id: Optional[str] = None @@ -306,6 +318,9 @@ def _add_nested_annotations(self, annotation_key, annotation_value, e: ProvEntit if not isinstance(annotation_value, (MutableSequence, MutableMapping)): e.add_attributes({annotation_key: str(annotation_value)}) + elif isinstance(annotation_value, MutableSequence): + for item_value in annotation_value: + e = self._add_nested_annotations(annotation_key, item_value, e) else: nested_id = uuid.uuid4().urn nested_entity = self.document.entity(nested_id) From 70d184fd4e465ed15e35ae2f5309edd27e6e273b Mon Sep 17 00:00:00 2001 From: RenskeW <64646852+RenskeW@users.noreply.github.com> Date: Tue, 5 Jul 2022 10:16:17 +0200 Subject: [PATCH 09/14] Transferred annotations from data: to id: entity --- cwltool/provenance_profile.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cwltool/provenance_profile.py b/cwltool/provenance_profile.py index 70edb182d..e27dcbc86 100644 --- a/cwltool/provenance_profile.py +++ b/cwltool/provenance_profile.py @@ -391,13 +391,13 @@ def declare_file(self, value: CWLObjectType) -> Tuple[ProvEntity, ProvEntity, st for s in schema_annotations: if "additionalType" in s: additional_type = schema_annotations[s].split(sep='/')[-1] # find better method? - entity.add_attributes( {PROV_TYPE: SCHEMA[additional_type]}) - else: # add support for CommentedSeq - entity = self._add_nested_annotations(s, schema_annotations[s], entity) + file_entity.add_attributes( {PROV_TYPE: SCHEMA[additional_type]}) + else: + file_entity = self._add_nested_annotations(s, schema_annotations[s], file_entity) # Transfer format annotations to provenance: if "format" in value: - entity.add_attributes({SCHEMA["encodingFormat"]: value["format"]}) + file_entity.add_attributes({SCHEMA["encodingFormat"]: value["format"]}) # Check for secondaries for sec in cast( From 690900230ce0e5252b0effc38ec95c2fc9a610c6 Mon Sep 17 00:00:00 2001 From: "Michael R. 
Crusoe" Date: Wed, 14 Dec 2022 13:27:44 +0100 Subject: [PATCH 10/14] formatting --- cwltool/provenance_profile.py | 53 +++++++++++++++++++++-------------- 1 file changed, 32 insertions(+), 21 deletions(-) diff --git a/cwltool/provenance_profile.py b/cwltool/provenance_profile.py index d61193f71..1eec5bb6b 100644 --- a/cwltool/provenance_profile.py +++ b/cwltool/provenance_profile.py @@ -78,7 +78,6 @@ def copy_job_order( return customised_job - class ProvenanceProfile: """ Provenance profile. @@ -261,11 +260,9 @@ def evaluate( # # add nested annotations # # how much of this can we reuse from _add_nested_annotations? # # how do we identify the correct file to write to? self.workflow_run_uri? - # # + # # # pass - - def record_process_start( self, process: Process, job: JobsType, process_run_id: Optional[str] = None ) -> Optional[str]: @@ -314,14 +311,16 @@ def record_process_end( self.generate_output_prov(outputs, process_run_id, process_name) self.document.wasEndedBy(process_run_id, None, self.workflow_run_uri, when) - def _add_nested_annotations(self, annotation_key, annotation_value, e: ProvEntity) -> ProvEntity: + def _add_nested_annotations( + self, annotation_key, annotation_value, e: ProvEntity + ) -> ProvEntity: """Propagate input data annotations to provenance.""" # Change https:// into http:// first - schema2_uri = "https://schema.org/" + schema2_uri = "https://schema.org/" if schema2_uri in annotation_key: - annotation_key = SCHEMA[annotation_key.replace(schema2_uri, '')].uri - - if not isinstance(annotation_value, (MutableSequence, MutableMapping)): + annotation_key = SCHEMA[annotation_key.replace(schema2_uri, "")].uri + + if not isinstance(annotation_value, (MutableSequence, MutableMapping)): e.add_attributes({annotation_key: str(annotation_value)}) elif isinstance(annotation_value, MutableSequence): for item_value in annotation_value: @@ -331,8 +330,10 @@ def _add_nested_annotations(self, annotation_key, annotation_value, e: ProvEntit nested_entity = self.document.entity(nested_id) e.add_attributes({annotation_key: nested_entity.identifier}) for nested_key in annotation_value: - nested_value = annotation_value[nested_key] - nested_entity = self._add_nested_annotations(nested_key, nested_value, nested_entity) + nested_value = annotation_value[nested_key] + nested_entity = self._add_nested_annotations( + nested_key, nested_value, nested_entity + ) return e def declare_file(self, value: CWLObjectType) -> Tuple[ProvEntity, ProvEntity, str]: @@ -396,15 +397,21 @@ def declare_file(self, value: CWLObjectType) -> Tuple[ProvEntity, ProvEntity, st self.document.specializationOf(file_entity, entity) # Identify all schema annotations - schema_annotations = dict([(v, value[v]) for v in value.keys() if 'schema.org' in v]) + schema_annotations = dict( + [(v, value[v]) for v in value.keys() if "schema.org" in v] + ) # Transfer SCHEMA annotations to provenance for s in schema_annotations: if "additionalType" in s: - additional_type = schema_annotations[s].split(sep='/')[-1] # find better method? - file_entity.add_attributes( {PROV_TYPE: SCHEMA[additional_type]}) - else: - file_entity = self._add_nested_annotations(s, schema_annotations[s], file_entity) + additional_type = schema_annotations[s].split(sep="/")[ + -1 + ] # find better method? 
+ file_entity.add_attributes({PROV_TYPE: SCHEMA[additional_type]}) + else: + file_entity = self._add_nested_annotations( + s, schema_annotations[s], file_entity + ) # Transfer format annotations to provenance: if "format" in value: @@ -517,18 +524,22 @@ def declare_directory(self, value: CWLObjectType) -> ProvEntity: coll.add_attributes(coll_attribs) coll_b.add_attributes(coll_b_attribs) - + # Identify all schema annotations - schema_annotations = dict([(v, value[v]) for v in value.keys() if 'schema.org' in v]) + schema_annotations = dict( + [(v, value[v]) for v in value.keys() if "schema.org" in v] + ) # Transfer SCHEMA annotations to provenance for s in schema_annotations: if "additionalType" in s: - additional_type = schema_annotations[s].split(sep='/')[-1] # find better method? - coll.add_attributes( {PROV_TYPE: SCHEMA[additional_type]}) + additional_type = schema_annotations[s].split(sep="/")[ + -1 + ] # find better method? + coll.add_attributes({PROV_TYPE: SCHEMA[additional_type]}) elif "hasPart" not in s: coll = self._add_nested_annotations(s, schema_annotations[s], coll) - + # Also Save ORE Folder as annotation metadata ore_doc = ProvDocument() ore_doc.add_namespace(ORE) From e2009ac91bfb2bb05f366ace0d755d353156d702 Mon Sep 17 00:00:00 2001 From: "Michael R. Crusoe" Date: Wed, 14 Dec 2022 15:34:23 +0100 Subject: [PATCH 11/14] basic test & fix typing --- cwltool/provenance_profile.py | 10 +++++----- tests/test_provenance.py | 22 ++++++++++++++++++++++ 2 files changed, 27 insertions(+), 5 deletions(-) diff --git a/cwltool/provenance_profile.py b/cwltool/provenance_profile.py index 1eec5bb6b..0c1445cd2 100644 --- a/cwltool/provenance_profile.py +++ b/cwltool/provenance_profile.py @@ -312,7 +312,7 @@ def record_process_end( self.document.wasEndedBy(process_run_id, None, self.workflow_run_uri, when) def _add_nested_annotations( - self, annotation_key, annotation_value, e: ProvEntity + self, annotation_key: str, annotation_value: Any, e: ProvEntity ) -> ProvEntity: """Propagate input data annotations to provenance.""" # Change https:// into http:// first @@ -398,13 +398,13 @@ def declare_file(self, value: CWLObjectType) -> Tuple[ProvEntity, ProvEntity, st # Identify all schema annotations schema_annotations = dict( - [(v, value[v]) for v in value.keys() if "schema.org" in v] + [(v, value[v]) for v in value.keys() if v.startswith("https://schema.org")] ) # Transfer SCHEMA annotations to provenance for s in schema_annotations: if "additionalType" in s: - additional_type = schema_annotations[s].split(sep="/")[ + additional_type = cast(str, schema_annotations[s]).split(sep="/")[ -1 ] # find better method? file_entity.add_attributes({PROV_TYPE: SCHEMA[additional_type]}) @@ -527,13 +527,13 @@ def declare_directory(self, value: CWLObjectType) -> ProvEntity: # Identify all schema annotations schema_annotations = dict( - [(v, value[v]) for v in value.keys() if "schema.org" in v] + [(v, value[v]) for v in value.keys() if v.startswith("https://schema.org")] ) # Transfer SCHEMA annotations to provenance for s in schema_annotations: if "additionalType" in s: - additional_type = schema_annotations[s].split(sep="/")[ + additional_type = cast(str, schema_annotations[s]).split(sep="/")[ -1 ] # find better method? 
coll.add_attributes({PROV_TYPE: SCHEMA[additional_type]}) diff --git a/tests/test_provenance.py b/tests/test_provenance.py index 4adb5e5c1..cfb80ccb8 100644 --- a/tests/test_provenance.py +++ b/tests/test_provenance.py @@ -81,6 +81,28 @@ def test_revsort_workflow(tmp_path: Path) -> None: check_provenance(folder) +@needs_docker +def test_revsort_label_annotations(tmp_path: Path) -> None: + """Affirm that EDAM file formats in the input object make it into CWLProv.""" + base_path = cwltool( + tmp_path, + get_data("tests/wf/revsort.cwl"), + get_data("tests/wf/revsort-job.json"), + ) + prov_file = base_path / "metadata" / "provenance" / "primary.cwlprov.nt" + arcp_root = find_arcp(base_path) + g = Graph() + with open(prov_file, "rb") as f: + g.parse(file=f, format="nt", publicID=arcp_root) + mime_having_objects = list(g.subjects(SCHEMA.encodingFormat)) + assert len(mime_having_objects) == 2 + for obj in mime_having_objects: + assert ( + cast(Literal, list(g.objects(obj, SCHEMA.encodingFormat))[0]).value + == "https://www.iana.org/assignments/media-types/text/plain" + ) + + @needs_docker def test_nested_workflow(tmp_path: Path) -> None: check_provenance(cwltool(tmp_path, get_data("tests/wf/nested.cwl")), nested=True) From 5466f95443b4bd0c2918b34f2ffb23b38b18c0a6 Mon Sep 17 00:00:00 2001 From: "Michael R. Crusoe" Date: Thu, 15 Dec 2022 17:33:44 +0100 Subject: [PATCH 12/14] start of a more advanced test --- MANIFEST.in | 3 + cwltool/provenance_profile.py | 14 +- tests/test_provenance.py | 21 ++ tests/wf/adv_prov/data/pdb_query.json | 0 .../data/prepared_biolip_win_p_testing.csv | 0 .../data/prepared_biolip_win_p_training.csv | 0 .../data/sabdab_summary_all_20220527.tsv | 0 tests/wf/adv_prov/model_example_params.json | 0 tests/wf/adv_prov/niaa_wf.cwl | 186 ++++++++++++++++++ tests/wf/adv_prov/niaa_wf_job.yml | 78 ++++++++ tests/wf/adv_prov/tools/combine_features.cwl | 88 +++++++++ tests/wf/adv_prov/tools/combine_inputs.py | 0 tests/wf/adv_prov/tools/combine_labels.cwl | 63 ++++++ tests/wf/adv_prov/tools/combine_labels.py | 0 tests/wf/adv_prov/tools/decompress.cwl | 32 +++ tests/wf/adv_prov/tools/dssp.cwl | 102 ++++++++++ tests/wf/adv_prov/tools/dssp_RASA.py | 0 tests/wf/adv_prov/tools/emulated_model.py | 0 .../tools/epitope_annotation_pipeline.py | 0 .../wf/adv_prov/tools/epitope_annotations.cwl | 100 ++++++++++ tests/wf/adv_prov/tools/get_pc7_inputs.py | 0 tests/wf/adv_prov/tools/get_psp19_inputs.py | 0 .../wf/adv_prov/tools/hhm_inputs_scatter.cwl | 70 +++++++ tests/wf/adv_prov/tools/pc7_inputs.cwl | 63 ++++++ .../wf/adv_prov/tools/pdb_batch_download.cwl | 88 +++++++++ tests/wf/adv_prov/tools/pdb_batch_download.sh | 0 tests/wf/adv_prov/tools/pdb_query.cwl | 76 +++++++ tests/wf/adv_prov/tools/pdb_query.py | 0 tests/wf/adv_prov/tools/ppi_annotations.cwl | 77 ++++++++ tests/wf/adv_prov/tools/ppi_annotations.py | 0 tests/wf/adv_prov/tools/process_sabdab.cwl | 67 +++++++ .../adv_prov/tools/process_sabdab_summary.py | 0 tests/wf/adv_prov/tools/psp19_inputs.cwl | 54 +++++ .../wf/adv_prov/tools/train_epitope_model.cwl | 68 +++++++ 34 files changed, 1246 insertions(+), 4 deletions(-) create mode 100644 tests/wf/adv_prov/data/pdb_query.json create mode 100644 tests/wf/adv_prov/data/prepared_biolip_win_p_testing.csv create mode 100644 tests/wf/adv_prov/data/prepared_biolip_win_p_training.csv create mode 100644 tests/wf/adv_prov/data/sabdab_summary_all_20220527.tsv create mode 100644 tests/wf/adv_prov/model_example_params.json create mode 100644 tests/wf/adv_prov/niaa_wf.cwl create mode 100644 
tests/wf/adv_prov/niaa_wf_job.yml create mode 100644 tests/wf/adv_prov/tools/combine_features.cwl create mode 100644 tests/wf/adv_prov/tools/combine_inputs.py create mode 100644 tests/wf/adv_prov/tools/combine_labels.cwl create mode 100644 tests/wf/adv_prov/tools/combine_labels.py create mode 100644 tests/wf/adv_prov/tools/decompress.cwl create mode 100644 tests/wf/adv_prov/tools/dssp.cwl create mode 100644 tests/wf/adv_prov/tools/dssp_RASA.py create mode 100644 tests/wf/adv_prov/tools/emulated_model.py create mode 100644 tests/wf/adv_prov/tools/epitope_annotation_pipeline.py create mode 100644 tests/wf/adv_prov/tools/epitope_annotations.cwl create mode 100644 tests/wf/adv_prov/tools/get_pc7_inputs.py create mode 100644 tests/wf/adv_prov/tools/get_psp19_inputs.py create mode 100644 tests/wf/adv_prov/tools/hhm_inputs_scatter.cwl create mode 100644 tests/wf/adv_prov/tools/pc7_inputs.cwl create mode 100644 tests/wf/adv_prov/tools/pdb_batch_download.cwl create mode 100644 tests/wf/adv_prov/tools/pdb_batch_download.sh create mode 100644 tests/wf/adv_prov/tools/pdb_query.cwl create mode 100644 tests/wf/adv_prov/tools/pdb_query.py create mode 100644 tests/wf/adv_prov/tools/ppi_annotations.cwl create mode 100644 tests/wf/adv_prov/tools/ppi_annotations.py create mode 100644 tests/wf/adv_prov/tools/process_sabdab.cwl create mode 100644 tests/wf/adv_prov/tools/process_sabdab_summary.py create mode 100644 tests/wf/adv_prov/tools/psp19_inputs.cwl create mode 100644 tests/wf/adv_prov/tools/train_epitope_model.cwl diff --git a/MANIFEST.in b/MANIFEST.in index f314e9ae2..0939a4cc2 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -9,6 +9,9 @@ include tests/loop/* include tests/tmp1/tmp2/tmp3/.gitkeep include tests/tmp4/alpha/* include tests/wf/* +include tests/wf/adv_prov/* +include tests/wf/adv_prov/data/* +include tests/wf/adv_prov/tools/* include tests/wf/operation/* include tests/override/* include tests/reloc/*.cwl diff --git a/cwltool/provenance_profile.py b/cwltool/provenance_profile.py index 0c1445cd2..158144a64 100644 --- a/cwltool/provenance_profile.py +++ b/cwltool/provenance_profile.py @@ -404,10 +404,16 @@ def declare_file(self, value: CWLObjectType) -> Tuple[ProvEntity, ProvEntity, st # Transfer SCHEMA annotations to provenance for s in schema_annotations: if "additionalType" in s: - additional_type = cast(str, schema_annotations[s]).split(sep="/")[ - -1 - ] # find better method? - file_entity.add_attributes({PROV_TYPE: SCHEMA[additional_type]}) + atype = schema_annotations[s] + if isinstance(atype, str): + additional_type = atype.split(sep="/")[-1] # find better method? + file_entity.add_attributes({PROV_TYPE: SCHEMA[additional_type]}) + else: + for a_entry in cast(List[str], atype): + additional_type = a_entry.split(sep="/")[ + -1 + ] # find better method? 
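+                    # a list-valued additionalType adds one prov:type per entry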
+ file_entity.add_attributes({PROV_TYPE: SCHEMA[additional_type]}) else: file_entity = self._add_nested_annotations( s, schema_annotations[s], file_entity diff --git a/tests/test_provenance.py b/tests/test_provenance.py index cfb80ccb8..a801d2eeb 100644 --- a/tests/test_provenance.py +++ b/tests/test_provenance.py @@ -103,6 +103,27 @@ def test_revsort_label_annotations(tmp_path: Path) -> None: ) +def test_advanced_prov_annotations(tmp_path: Path) -> None: + """Pass through of advanced input annotations.""" + base_path = cwltool( + tmp_path, + get_data("tests/wf/adv_prov/niaa_wf.cwl"), + get_data("tests/wf/adv_prov/niaa_wf_job.yml"), + ) + prov_file = base_path / "metadata" / "provenance" / "primary.cwlprov.nt" + arcp_root = find_arcp(base_path) + g = Graph() + with open(prov_file, "rb") as f: + g.parse(file=f, format="nt", publicID=arcp_root) + mime_having_objects = list(g.subjects(SCHEMA.encodingFormat)) + assert len(mime_having_objects) == 8 + # for obj in mime_having_objects: + # assert ( + # cast(Literal, list(g.objects(obj, SCHEMA.encodingFormat))[0]).value + # == "https://www.iana.org/assignments/media-types/text/plain" + # ) + + @needs_docker def test_nested_workflow(tmp_path: Path) -> None: check_provenance(cwltool(tmp_path, get_data("tests/wf/nested.cwl")), nested=True) diff --git a/tests/wf/adv_prov/data/pdb_query.json b/tests/wf/adv_prov/data/pdb_query.json new file mode 100644 index 000000000..e69de29bb diff --git a/tests/wf/adv_prov/data/prepared_biolip_win_p_testing.csv b/tests/wf/adv_prov/data/prepared_biolip_win_p_testing.csv new file mode 100644 index 000000000..e69de29bb diff --git a/tests/wf/adv_prov/data/prepared_biolip_win_p_training.csv b/tests/wf/adv_prov/data/prepared_biolip_win_p_training.csv new file mode 100644 index 000000000..e69de29bb diff --git a/tests/wf/adv_prov/data/sabdab_summary_all_20220527.tsv b/tests/wf/adv_prov/data/sabdab_summary_all_20220527.tsv new file mode 100644 index 000000000..e69de29bb diff --git a/tests/wf/adv_prov/model_example_params.json b/tests/wf/adv_prov/model_example_params.json new file mode 100644 index 000000000..e69de29bb diff --git a/tests/wf/adv_prov/niaa_wf.cwl b/tests/wf/adv_prov/niaa_wf.cwl new file mode 100644 index 000000000..fc45dd88d --- /dev/null +++ b/tests/wf/adv_prov/niaa_wf.cwl @@ -0,0 +1,186 @@ +#!/usr/bin/env cwl-runner + +cwlVersion: v1.2 +class: Workflow + +intent: [ edam:operation_2423 ] # Prediction ope +doc: "This mock workflow calculates input features and labels which are used to train a deep learning model for epitope prediction." + +requirements: + ScatterFeatureRequirement: {} + StepInputExpressionRequirement: {} + SubworkflowFeatureRequirement: {} + +inputs: + sabdab_summary: + type: File + format: iana:text/tab-separated-values + doc: "SAbDAb Summary metadata about all structures in the database." + biodl_train_dataset: + type: File + format: iana:text/csv + doc: "BioDL training dataset containing PPI interactions" + biodl_test_dataset: + type: File + doc: "BioDL test dataset with PPI interactions." + hhblits_db: + type: Directory + doc: "Reference database for HHblits" + hhblits_db_name: + type: string + doc: "Name of hhblits reference database" + pdb_search_api_query: + type: File + format: iana:application/json + doc: "Structured query for PDB API." + +outputs: + model_output: + type: File + outputSource: train_epitope_prediction_model/train_log + doc: "Output of the prediction model." 
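+# The steps below wrap the tools under ./tools/; several are emulated, as their doc fields note.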
+ +steps: + run_pdb_query: + in: + pdb_search_query: pdb_search_api_query + out: + [ processed_response ] + run: ./tools/pdb_query.cwl + doc: | + Use PDB search API to run a query on the Protein Data Bank. Returns .txt file with comma-separated PDB IDs which satisfy the query requirements. + See https://search.rcsb.org/index.html#search-api for a tutorial. + + download_pdb_files: + in: + input_file: run_pdb_query/processed_response + mmcif_format: { default: True } + pdb_format: { default: True } + out: + [ pdb_files ] + run: ./tools/pdb_batch_download.cwl + + decompress_pdb_files: + in: + pdb_archives: download_pdb_files/pdb_files + out: [ cifs, pdbs ] + run: ./tools/decompress.cwl + doc: "Decompress files using gzip" + + generate_dssp_labels: + in: + pdb_files: decompress_pdb_files/pdbs # change this later + rsa_cutoff: { default : 0.06 } + out: [ dssp_output_files ] + run: ./tools/dssp.cwl + doc: "Use DSSP to extract secondary structure and solvent accessibility from PDB files." + + generate_ppi_labels: + in: + mmcif_files: decompress_pdb_files/cifs + train_dataset: biodl_train_dataset + test_dataset: biodl_test_dataset + out: [ ppi_fasta_files ] + run: ./tools/ppi_annotations.cwl + doc: "Extract ppi annotations from BioDL. This step is partly emulated." + + preprocess_sabdab_data: + doc: "Extract antigen chains from SAbDab summary file." + in: + sabdab_summary: sabdab_summary + out: [ processed_summary ] + run: ./tools/process_sabdab.cwl + + generate_epitope_labels: + in: + mmcif_files: decompress_pdb_files/cifs + sabdab_processed: preprocess_sabdab_data/processed_summary + out: [ epitope_fasta_dir ] + run: ./tools/epitope_annotations.cwl + doc: "Extract epitope annotations from PDB files." + + combine_labels: + doc: "Combine labels into 1 file per protein sequence." + run: ./tools/combine_labels.cwl + in: + epitope_directory: generate_epitope_labels/epitope_fasta_dir + ppi_directory: generate_ppi_labels/ppi_fasta_files + dssp_directory: generate_dssp_labels/dssp_output_files + out: [ labels_combined ] + + generate_pc7: + doc: Calculate PC7 features for each residue in each protein sequence. + run: ./tools/pc7_inputs.cwl # to do: adapt tool so it takes directory of fasta files as input + in: + fasta: generate_ppi_labels/ppi_fasta_files + out: [ pc7_features ] + + generate_psp19: + label: Calculate PSP19 features for each residue in each protein sequence. 
+ run: ./tools/psp19_inputs.cwl + in: + fasta: generate_ppi_labels/ppi_fasta_files + out: [ psp19_features ] + + generate_hhm: + in: + query_sequences: + source: generate_ppi_labels/ppi_fasta_files # type Directory + valueFrom: $(self.listing) # here type Directory is converted to File array + hhblits_db: hhblits_db + hhblits_db_name: hhblits_db_name + hhblits_n_iterations: { default: 1 } + out: [ hhm_file_array ] + run: + class: Workflow # this is a subworkflow as a workaround because generate_ppi_labels/ppi_fasta_files is Directory while run_hhblits takes File + inputs: + query_sequences: File[] + hhblits_db: Directory + hhblits_db_name: string + hhblits_n_iterations: int + outputs: + hhm_file_array: + type: File[] + outputSource: run_hhblits/hhm + steps: + run_hhblits: + in: + protein_query_sequence: query_sequences + database: hhblits_db + database_name: hhblits_db_name + n_iterations: hhblits_n_iterations + out: [ hhm ] + scatter: protein_query_sequence + run: ./tools/hhm_inputs_scatter.cwl + combine_features: + in: + input_sequences: generate_ppi_labels/ppi_fasta_files + pc7_features: generate_pc7/pc7_features + psp19_features: generate_psp19/psp19_features + hhm_features: generate_hhm/hhm_file_array # file array, combine_features.cwl converts it to directory + out: [ combined_features ] + run: ./tools/combine_features.cwl + + train_epitope_prediction_model: # This step incorporates both training and prediction, not sure if this is the case in the real workflow. + in: # in the real workflow, the configuration file would be generated as part of the workflow as well + input_features: combine_features/combined_features + input_labels: combine_labels/labels_combined + out: [ train_log ] + run: ./tools/train_epitope_model.cwl + doc: "Predict epitope residues using a multi-task learning approach. This step is not real yet." + +$namespaces: + iana: "https://www.iana.org/assignments/media-types/" + s: "https://schema.org/" + edam: "http://edamontology.org/" + cwlprov: "https://w3id.org/cwl/prov#" + +$schemas: +- https://schema.org/version/latest/schemaorg-current-https.rdf +- https://edamontology.org/EDAM_1.25.owl + +s:author: +- s:name: "Renske de Wit" + s:identifier: https://orcid.org/0000-0003-0902-0086 +- s:name: "Katharina Waury" +s:license: https://spdx.org/licenses/Apache-2.0 diff --git a/tests/wf/adv_prov/niaa_wf_job.yml b/tests/wf/adv_prov/niaa_wf_job.yml new file mode 100644 index 000000000..787274ece --- /dev/null +++ b/tests/wf/adv_prov/niaa_wf_job.yml @@ -0,0 +1,78 @@ +cwlprov:prov: + sabdab_search: + s:additionalType: s:SearchAction + s:query: "All structures" + s:endTime: 2022-05-27 + s:object: + s:name: "Structural Antibody Database" + s:citation: + s:identifier: https://doi.org/10.1093/nar/gkab1050 + s:result: sabdab_summary + s:description: "Search Action for metadata on antibody-antigen complexes in SAbDab" + + +pdb_search_api_query: + class: File + location: ./data/pdb_query.json + format: iana:application/json + s:description: "Input query for PDB search API." + s:additionalType: + - edam:data_3786 # Query script + +sabdab_summary: + class: File + path: ./data/sabdab_summary_all_20220527.tsv + format: iana:text/tab-separated-values + s:description: "Summary file downloaded from SAbDAb database, containing metadata for all structures." 
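+  # each additionalType entry is propagated as a prov:type on the file entity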
+ s:additionalType: + - edam:data_2080 # database search results + - s:Dataset + + +biodl_train_dataset: + class: File + path: data/prepared_biolip_win_p_training.csv + #location: https://www.ibi.vu.nl/downloads/PIPENN/PIPENN/BioDL-Datasets/prepared_biolip_win_p_training.csv + format: iana:text/csv + s:description: "BioDL training set containing PPI annotations for protein sequences (UniProt IDs)" + s:name: "BioDL training dataset" + s:citation: + s:identifier: https://doi.org/10.1093/bioinformatics/btac071 + s:additionalType: + - s:Dataset + - edam:data_1277 # protein features + +biodl_test_dataset: + class: File + path: data/prepared_biolip_win_p_testing.csv + #location: https://www.ibi.vu.nl/downloads/PIPENN/PIPENN/BioDL-Datasets/prepared_biolip_win_p_testing.csv + s:description: "BioDL test set containing PPI annotations for protein sequences (UniProt IDs)." + s:name: "BioDL test dataset" + s:citation: + s:identifier: https://doi.org/10.1093/bioinformatics/btac071 + s:additionalType: + - s:Dataset + - edam:data_1277 # protein features + +hhblits_db: + class: Directory + location: ../hhblits/databases + s:citation: + s:identifier: https://doi.org/10.1038/nmeth.1818 + s:name: "pdb70" + s:description: "Directory containing HHBlits reference database." + s:additionalType: + - s:Dataset + - edam:data_0955 # data index + +hhblits_db_name: pdb70 +hhblits_n_iterations: 1 + +s:description: "Demonstration run of epitope prediction workflow. Some steps are emulated, so the results of the workflow are not yet biologically meaningful." + +$namespaces: + iana: "https://www.iana.org/assignments/media-types/" + s: "https://schema.org/" + edam: "http://edamontology.org/" + cwlprov: "https://w3id.org/cwl/prov#" + diff --git a/tests/wf/adv_prov/tools/combine_features.cwl b/tests/wf/adv_prov/tools/combine_features.cwl new file mode 100644 index 000000000..1cf62735d --- /dev/null +++ b/tests/wf/adv_prov/tools/combine_features.cwl @@ -0,0 +1,88 @@ +#!/usr/bin/env cwl-runner + +cwlVersion: v1.2 +class: CommandLineTool +baseCommand: bash # python3 + +label: Combine input features + +doc: | + "Combines the input features for each protein sequence into 1 file per sequence. Output is stored in a new directory." 
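+# Emulated tool: the bash stub only creates placeholder outputs; the real Python invocation is kept in the commented arguments below.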
+ +hints: + # DockerRequirement: + # dockerPull: amancevice/pandas:1.3.4-slim + SoftwareRequirement: + packages: + numpy: + specs: [ https://anaconda.org/conda-forge/numpy ] + version: [ "1.21.4" ] + pandas: + specs: [ https://anaconda.org/conda-forge/pandas ] + version: [ "1.3.4" ] + +requirements: + InlineJavascriptRequirement: {} + InitialWorkDirRequirement: + listing: | + ${ + return [{"entry": {"class": "Directory", "basename": "hhm_features_dir", "listing": inputs.hhm_features}, "writable": true}] + } + +arguments: + # - $(inputs.script.path) + # - $(inputs.input_sequences.path) + # - "hhm_features_dir" + # - $(inputs.pc7_features.path) + # - $(inputs.psp19_features.path) + # - "--outdir" + # - ./$(inputs.outdir_name) # An output directory will be created in current working directory + - -c + - | + set -ex + mkdir $(inputs.outdir_name) + touch $(inputs.outdir_name)/$(inputs.input_sequences.basename) + + +inputs: + script: + type: File + default: + class: File + location: ./combine_inputs.py + input_sequences: + type: Directory + # default: + # class: Directory + # location: ../data/test_set/ppi_fasta # delete this later + hhm_features: + type: File[] + # default: + # - class: File + # location: ../final_test_run/2HKF_P.hhm + # - class: File + # location: ../final_test_run/4W6W_A.hhm + # - class: File + # location: ../final_test_run/4W6X_A.hhm + # - class: File + # location: ../final_test_run/4W6Y_A.hhm + pc7_features: + type: Directory + # default: + # class: Directory + # location: ../final_test_run/pc7_features + psp19_features: + type: Directory + # default: + # class: Directory + # location: ../final_test_run/psp19_features + outdir_name: + type: string + default: "input_features" + +outputs: + combined_features: + type: Directory + outputBinding: + glob: $(inputs.outdir_name) + diff --git a/tests/wf/adv_prov/tools/combine_inputs.py b/tests/wf/adv_prov/tools/combine_inputs.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/wf/adv_prov/tools/combine_labels.cwl b/tests/wf/adv_prov/tools/combine_labels.cwl new file mode 100644 index 000000000..157e9fd7e --- /dev/null +++ b/tests/wf/adv_prov/tools/combine_labels.cwl @@ -0,0 +1,63 @@ +#!/usr/bin/env cwl-runner + +cwlVersion: v1.2 +class: CommandLineTool + +baseCommand: bash # python3 + +# hints: +# DockerRequirement: +# dockerPull: amancevice/pandas:1.3.4-slim +# SoftwareRequirement: +# packages: +# pandas: +# specs: [ https://anaconda.org/conda-forge/pandas ] +# version: [ "1.3.4" ] +# python: +# version: [ "3.9.7" ] + +arguments: + # - $(inputs.script.path) + # - $(inputs.epitope_directory.path) + # - $(inputs.ppi_directory.path) + # - $(inputs.dssp_directory.path) + # - "--outdir" + # - $(inputs.output_directory) + - -c + - | + set -ex + mkdir $(inputs.output_directory) + touch $(inputs.output_directory)/$(inputs.epitope_directory.basename) + touch $(inputs.output_directory)/$(inputs.ppi_directory.basename) + touch $(inputs.output_directory)/$(inputs.dssp_directory.basename) + + +inputs: + script: + type: File + default: + class: File + location: ./combine_labels.py + epitope_directory: + type: Directory + doc: Directory with FASTA files with epitope annotations. + ppi_directory: + type: Directory + doc: Directory with FASTA files with PPI annotations. + dssp_directory: + type: Directory + doc: Directory with DSSP output files. 
+ output_directory: + type: string + default: "./combined_labels" + +outputs: + labels_combined: + type: Directory + doc: "Directory with 1 file per sequence, containing label values for each residue" + outputBinding: + glob: $(inputs.output_directory) + + + + diff --git a/tests/wf/adv_prov/tools/combine_labels.py b/tests/wf/adv_prov/tools/combine_labels.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/wf/adv_prov/tools/decompress.cwl b/tests/wf/adv_prov/tools/decompress.cwl new file mode 100644 index 000000000..8c68ccb6e --- /dev/null +++ b/tests/wf/adv_prov/tools/decompress.cwl @@ -0,0 +1,32 @@ +#!/usr/bin/env cwl-runner + +cwlVersion: v1.2 +class: CommandLineTool + +requirements: + InitialWorkDirRequirement: + listing: $(inputs.pdb_archives) + +baseCommand: bash # gunzip + +arguments: + - -c + - | + set -ex; for file in *.gz; do mv \${file} \${file%%.gz}; done + +inputs: + pdb_archives: + type: File[] + # inputBinding: + # position: 0 + +outputs: + cifs: + type: File[] + outputBinding: + glob: "*.cif" + pdbs: + type: File[] + outputBinding: + glob: "*.pdb" + diff --git a/tests/wf/adv_prov/tools/dssp.cwl b/tests/wf/adv_prov/tools/dssp.cwl new file mode 100644 index 000000000..6279f2444 --- /dev/null +++ b/tests/wf/adv_prov/tools/dssp.cwl @@ -0,0 +1,102 @@ +#!/usr/bin/env cwl-runner + +cwlVersion: v1.2 +class: CommandLineTool +baseCommand: bash # python3 + +doc: "Use DSSP to extract secondary structure and solvent accessibility from PDB files." +intent: [ http://edamontology.org/operation_0320 ] + +requirements: + InlineJavascriptRequirement: {} + InitialWorkDirRequirement: # the script takes a directory as input + listing: | + ${ + return [{"entry": {"class": "Directory", "basename": "pdb_source_dir", "listing": inputs.pdb_files}, "writable": true}] + } + +hints: + # DockerRequirement: + # dockerPull: biopython/biopython@sha256:437075df44b0c9b3da96f71040baef0086789de7edf73c81de4ace30a127a245 + SoftwareRequirement: + packages: + pandas: + version: [ "0.19.1" ] + specs: [ https://pypi.org/project/pandas/ ] + biopython: + specs: [ https://pypi.org/project/biopython/ ] + version: [ "1.75" ] + dssp: + specs: [ https://swift.cmbi.umcn.nl/gv/dssp/ ] + version: [ "2.0.4" ] # this version does not support mmCIF files + python: + version: [ "3.5" ] + +arguments: + # - $(inputs.script.path) + # - "pdb_source_dir" + # - "-o" + # - $(inputs.output_dir) + # - "-d" + # - $(inputs.dssp) + # - "-c" + # - $(inputs.rsa_cutoff) + - -c + - | + set -ex + mkdir $(inputs.output_dir) + touch $(inputs.output_dir)/$(inputs.pdb_files[0].nameroot) + + +inputs: + script: + type: File + default: + class: File + location: ./dssp_RASA.py + pdb_files: + type: File[] + doc: "Protein structures in PDB format." + output_dir: + type: string + default: "dssp_output" + dssp: + type: string + default: "dssp" # for newer dssp versions: mkdssp + rsa_cutoff: + type: float + default: 0.06 + doc: "Threshold exposed surface area for considering amino acids buried." + +outputs: + dssp_output_files: + type: Directory + outputBinding: + glob: $(inputs.output_dir) + +s:author: +- class: s:Person + s:name: "Renske de Wit" +s:license: https://spdx.org/licenses/Apache-2.0 +s:dateCreated: "2022-05-28" +s:mainEntity: + class: s:SoftwareApplication + s:license: https://spdx.org/licenses/Apache-2.0 + s:author: + - class: s:Person + s:name: "DS" + s:description: "Script which takes a directory of pdb files as input and calculates relative surface accessibility for each residue in the protein sequence." 
+ s:basedOn: + - class: s:SoftwareApplication + s:name: "DSSP" + +$namespaces: + s: https://schema.org/ + edam: http://edamontology.org/ + +$schemas: +- https://schema.org/version/latest/schemaorg-current-https.rdf +- https://edamontology.org/EDAM_1.25.owl + + + diff --git a/tests/wf/adv_prov/tools/dssp_RASA.py b/tests/wf/adv_prov/tools/dssp_RASA.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/wf/adv_prov/tools/emulated_model.py b/tests/wf/adv_prov/tools/emulated_model.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/wf/adv_prov/tools/epitope_annotation_pipeline.py b/tests/wf/adv_prov/tools/epitope_annotation_pipeline.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/wf/adv_prov/tools/epitope_annotations.cwl b/tests/wf/adv_prov/tools/epitope_annotations.cwl new file mode 100644 index 000000000..7d744194e --- /dev/null +++ b/tests/wf/adv_prov/tools/epitope_annotations.cwl @@ -0,0 +1,100 @@ +#!/usr/bin/env cwl-runner + +cwlVersion: v1.2 +class: CommandLineTool + +baseCommand: bash # python3 + +intent: [ http://edamontology.org/operation_0320 ] + +requirements: + InlineJavascriptRequirement: {} + InitialWorkDirRequirement: # the script takes a directory as input + listing: | + ${ + return [{"entry": {"class": "Directory", "basename": "mmcif_directory", "listing": inputs.mmcif_files}, "writable": true}] + } + +doc: | + Runs Python script which takes directory of mmCIF files as input and outputs directory of FASTA files with protein sequence + epitope annotations. + +hints: + # DockerRequirement: + # dockerImageId: pdbecif-pandas:20220620 + # dockerFile: | + # FROM docker.io/debian:stable-slim + # RUN apt-get update && apt-get install -y --no-install-recommends python3-pip + # RUN python3 -m pip install PDBeCif pandas + SoftwareRequirement: + packages: + pandas: + specs: [ https://anaconda.org/conda-forge/pandas ] + version: [ "1.2.4" ] + python: + version: [ "3.9.1" ] + pdbecif: + specs: [ https://pypi.org/project/PDBeCif/ ] + version: [ "1.5" ] + +arguments: + # - $(inputs.script.path) + # - "mmcif_directory" + # - $(inputs.sabdab_processed.path) + # - "--fasta_directory" + # - $(inputs.fasta_output_dir) + # - "--df_directory" + # - $(inputs.df_output_dir) + - -c + - | + mkdir $(inputs.fasta_output_dir) $(inputs.df_output_dir); + touch $(inputs.fasta_output_dir)/$(inputs.mmcif_files[0].basename).fasta + touch $(inputs.df_output_dir)/$(inputs.mmcif_files[0].basename).df + +inputs: + script: + type: File + default: + class: File + location: ./epitope_annotation_pipeline.py + mmcif_files: + type: File[] + doc: mmCIF file array + sabdab_processed: + format: iana:text/csv + type: File + doc: "table of PDB entries with associated H, L and antigen chain." + fasta_output_dir: + type: string + default: "./epitope_fasta" + df_output_dir: + type: string + default: "./epitope_df" + +outputs: + epitope_fasta_dir: + type: Directory + outputBinding: + glob: $(inputs.fasta_output_dir) + epitope_df_dir: + type: Directory + outputBinding: + glob: $(inputs.df_output_dir) + +s:dateCreated: 2022-05-30 + +s:mainEntity: + s:additionalType: s:SoftwareApplication + s:author: + - s:name: "Katharina Waury" + s:dateCreated: 2022-02-10 + s:programmingLanguage: Python + s:description: "Script which extracts epitope annotations and dataframes from mmCIF files." 
+ +$namespaces: + s: https://schema.org/ + edam: http://edamontology.org/ + iana: https://www.iana.org/assignments/media-types/ + +$schemas: +- https://schema.org/version/latest/schemaorg-current-https.rdf +- https://edamontology.org/EDAM_1.25.owl diff --git a/tests/wf/adv_prov/tools/get_pc7_inputs.py b/tests/wf/adv_prov/tools/get_pc7_inputs.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/wf/adv_prov/tools/get_psp19_inputs.py b/tests/wf/adv_prov/tools/get_psp19_inputs.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/wf/adv_prov/tools/hhm_inputs_scatter.cwl b/tests/wf/adv_prov/tools/hhm_inputs_scatter.cwl new file mode 100644 index 000000000..eaba7bf1a --- /dev/null +++ b/tests/wf/adv_prov/tools/hhm_inputs_scatter.cwl @@ -0,0 +1,70 @@ +cwlVersion: v1.2 +class: CommandLineTool +baseCommand: bash # hhblits + +doc: | + CommandLineTool for hhblits, part of HH-suite. See https://github.com/soedinglab/hh-suite for documentation. +hints: + SoftwareRequirement: + packages: + hhsuite: + specs: + - https://anaconda.org/bioconda/hhsuite + - https://bio.tools/hhsuite + version: [ "3.3.0" ] +# DockerRequirement: +# dockerPull: quay.io/biocontainers/hhsuite:3.3.0--py39pl5321h67e14b5_5 # this is the version opus-tass uses? + +inputs: + protein_query_sequence: + type: File + # format: [ + # edam:format_1929, # FASTA + # edam:format_3281, # A2M + # ] + database: Directory # too large to be included in RO, change later to type string = path to database + database_name: string + n_iterations: + type: int + default: 2 # change this to the correct value + + +arguments: + # - "-i" + # - $(inputs.protein_query_sequence.path) #$(inputs.fasta_dir.path)/$(inputs.protein_id).fasta + # - "-d" + # - $(inputs.database.path)/$(inputs.database_name) + # - "-o" + # - $(inputs.protein_query_sequence.nameroot).hhr + # - "-ohhm" + # - $(inputs.protein_query_sequence.nameroot).hhm + # - "-n" + # - $(inputs.n_iterations) + - -c + - | + set +ex + touch $(inputs.protein_query_sequence.nameroot).hhr + touch $(inputs.protein_query_sequence.nameroot).hhm + + +outputs: + hhm: + type: File + outputBinding: + glob: "*.hhm" + + +s:author: # Creator of this CWL document +- s:identifier: https://orcid.org/0000-0003-0902-0086 + +s:license: Apache-2.0 + +$namespaces: + s: https://schema.org/ + edam: http://edamontology.org/ + +$schemas: +- https://schema.org/version/latest/schemaorg-current-https.rdf +- https://edamontology.org/EDAM_1.25.owl + + diff --git a/tests/wf/adv_prov/tools/pc7_inputs.cwl b/tests/wf/adv_prov/tools/pc7_inputs.cwl new file mode 100644 index 000000000..003fef3cf --- /dev/null +++ b/tests/wf/adv_prov/tools/pc7_inputs.cwl @@ -0,0 +1,63 @@ +#!/usr/bin/env cwl-runner + +cwlVersion: v1.0 +class: CommandLineTool +# hints: +# DockerRequirement: +# dockerPull: amancevice/pandas:1.3.4-slim # Script needs numpy which is a dependency of pandas +# SoftwareRequirement: +# packages: +# numpy: +# specs: [ https://anaconda.org/conda-forge/numpy ] + +baseCommand: bash # python3 + +doc: PC7 features are assigned to each residue in each protein sequence. Output is a directory of files (1 per sequence). 
+# intent: [ http://edamontology.org/operation_0361 ] + +inputs: + script: + type: File + default: + class: File + location: ./get_pc7_inputs.py + # inputBinding: { position: 1 } + fasta: + type: Directory + format: edam:format_2200 # fasta-like (text) + # inputBinding: + # position: 2 + outdir: + type: string + # inputBinding: + # position: 3 + # prefix: -o + default: "pc7_features" + +arguments: + - -c + - | + mkdir $(inputs.outdir) + touch $(inputs.outdir)/$(inputs.fasta.nameroot) + +outputs: + pc7_features: + type: Directory + outputBinding: + glob: $(inputs.outdir) + +s:mainEntity: # add that this is a commandlinetool + s:programmingLanguage: Python + s:codeRepository: https://github.com/RenskeW/cwl-epitope/blob/b5e31d42006fd7003716f57963646d47d1154549/tools/get_pc7_inputs.py + s:isBasedOn: + - s:additionalType: s:SoftwareApplication + s:name: OPUS-TASS + s:identifier: https://bio.tools/opus-tass + +$namespaces: + s: https://schema.org/ + edam: http://edamontology.org/ + +$schemas: +- https://schema.org/version/latest/schemaorg-current-https.rdf +- https://edamontology.org/EDAM_1.25.owl diff --git a/tests/wf/adv_prov/tools/pdb_batch_download.cwl b/tests/wf/adv_prov/tools/pdb_batch_download.cwl new file mode 100644 index 000000000..97b3bde3b --- /dev/null +++ b/tests/wf/adv_prov/tools/pdb_batch_download.cwl @@ -0,0 +1,88 @@ +#!/usr/env/bin cwl-runner + +cwlVersion: v1.2 +class: CommandLineTool + +baseCommand: touch # bash + +doc: "Download files from the PDB in a specific format." + +intent: [ http://edamontology.org/operation_2422 ] +requirements: + NetworkAccess: + networkAccess: True + +inputs: + script: + type: File + # inputBinding: + # position: 1 + default: + class: File + location: ./pdb_batch_download.sh + input_file: + doc: "Comma-separated .txt file with pdb entries to download" + type: File + format: iana:text/csv + # inputBinding: + # position: 3 + # prefix: "-f" + mmcif_format: # The last arguments specify the format in which entries will be downloaded + type: boolean + # inputBinding: + # position: 4 + # prefix: "-c" # .cif.gz + default: True + pdb_format: + type: boolean + # inputBinding: + # position: 5 + # prefix: "-p" # .pdb.gz + default: False + pdb1_format: + type: boolean + # inputBinding: + # position: 6 + # prefix: "-a" # .pdb1.gz + default: False + xml_format: + type: boolean + # inputBinding: + # position: 7 + # prefix: "-x" # .xml.gz + default: False + sfcif_format: + type: boolean + # inputBinding: + # position: 8 + # prefix: "-s" # .sf.cif.gz + default: False + mr_format: + type: boolean + # inputBinding: + # position: 9 + # prefix: "-m" # .mr.gz + default: False + mr_str_format: + type: boolean + # inputBinding: + # position: 10 + # prefix: "-r" # .mr.str.gz + default: False + +arguments: + - $(inputs.input_file.nameroot).1.cif.gz + - $(inputs.input_file.nameroot).2.cif.gz + - $(inputs.input_file.nameroot).1.pdb.gz + - $(inputs.input_file.nameroot).2.pdb.gz + +outputs: + pdb_files: + type: File[] + outputBinding: + glob: "*.gz" + doc: "Downloaded files" + + +$namespaces: + iana: https://www.iana.org/assignments/media-types/ diff --git a/tests/wf/adv_prov/tools/pdb_batch_download.sh b/tests/wf/adv_prov/tools/pdb_batch_download.sh new file mode 100644 index 000000000..e69de29bb diff --git a/tests/wf/adv_prov/tools/pdb_query.cwl b/tests/wf/adv_prov/tools/pdb_query.cwl new file mode 100644 index 000000000..62d34c584 --- /dev/null +++ b/tests/wf/adv_prov/tools/pdb_query.cwl @@ -0,0 +1,76 @@ +#!/usr/bin/env cwl-runner + +cwlVersion: v1.2 +class: CommandLineTool 
+
+baseCommand: cat # python3
+
+requirements:
+  NetworkAccess:
+    networkAccess: True
+
+intent: [ http://edamontology.org/operation_2421 ] # Database search
+
+hints:
+  # DockerRequirement:
+  #   dockerPull: nyurik/alpine-python3-requests@sha256:e0553236e3ebaa240752b41b8475afb454c5ab4c17eb023a2a904637eda16cf6
+  SoftwareRequirement:
+    packages:
+      python3:
+        version: [ 3.9.5 ]
+      requests:
+        version: [ 2.25.1 ]
+
+arguments:
+  # - $(inputs.script.path)
+  - $(inputs.pdb_search_query.path)
+  # - "--outpath"
+  # - $(inputs.return_file)
+
+stdout: $(inputs.return_file)
+
+inputs:
+  script:
+    type: File
+    default:
+      class: File
+      location: ./pdb_query.py
+  pdb_search_query:
+    type: File
+    label: Query for PDB search API in json format
+    format: iana:application/json
+  return_file:
+    type: string
+    label: Path to output file
+    default: "./pdb_ids.txt"
+    doc: "Comma-separated text file with PDB ids"
+
+outputs:
+  processed_response:
+    type: File
+    format: iana:text/csv
+    doc: Comma-separated text file with returned identifiers from PDB search API
+    outputBinding:
+      glob: $(inputs.return_file)
+
+# label: Query PDB search API and store output in comma-separated text file.
+
+doc: |
+  This tool invokes a Python script which uses the requests library to query the PDB search API and returns the identifiers from the response as a comma-separated file.
+  More information about the PDB search API: https://search.rcsb.org/index.html
+
+
+$namespaces:
+  iana: https://www.iana.org/assignments/media-types/
+  s: https://schema.org/
+
+$schemas:
+- https://schema.org/version/latest/schemaorg-current-https.rdf
+
+s:author:
+- s:identifier: https://orcid.org/0000-0003-0902-0086
+
+s:mainEntity:
+  s:author:
+  - s:identifier: https://orcid.org/0000-0003-0902-0086
+
diff --git a/tests/wf/adv_prov/tools/pdb_query.py b/tests/wf/adv_prov/tools/pdb_query.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/tests/wf/adv_prov/tools/ppi_annotations.cwl b/tests/wf/adv_prov/tools/ppi_annotations.cwl
new file mode 100644
index 000000000..2ce630b18
--- /dev/null
+++ b/tests/wf/adv_prov/tools/ppi_annotations.cwl
@@ -0,0 +1,77 @@
+#!/usr/bin/env cwl-runner
+
+cwlVersion: v1.2
+class: CommandLineTool
+
+baseCommand: bash # python3
+
+doc: "Extract PPI annotations from BioDL."
+intent: [ http://edamontology.org/operation_0320 ]
+
+hints:
+  # DockerRequirement:
+  #   dockerImageId: pdbecif-pandas:20220620
+  #   dockerFile: |
+  #     FROM docker.io/debian:stable-slim
+  #     RUN apt-get update && apt-get install -y --no-install-recommends python3-pip
+  #     RUN python3 -m pip install PDBeCif pandas
+  SoftwareRequirement:
+    packages:
+      pandas:
+        specs: [ https://anaconda.org/conda-forge/pandas ]
+        version: [ "1.2.4" ]
+      python:
+        version: [ "3.9.1" ]
+      pdbecif:
+        specs: [ https://pypi.org/project/PDBeCif/ ]
+        version: [ "1.5" ]
+
+requirements:
+  InlineJavascriptRequirement: {}
+  InitialWorkDirRequirement: # the script takes a directory as input
+    listing:
+      - entry: |
+          ${ return {"class": "Directory",
+                     "listing": inputs.mmcif_files };
+          }
+        entryname: mmcif_directory
+        # writable: true
+
+inputs:
+  script:
+    type: File
+    default:
+      class: File
+      location: ./ppi_annotations.py
+  mmcif_files: # the download step produces an array of files, but the script takes a Directory, hence the InitialWorkDirRequirement
+    type: File[]
+  train_dataset:
+    type: File
+    doc: "BioDL training set"
+  test_dataset:
+    type: File
+    doc: "BioDL test set"
+  output_directory_name:
+    type: string
+    default: "ppi_fasta"
+
+arguments:
+# - $(inputs.script.path)
+# - "mmcif_directory"
+# - $(inputs.train_dataset.path)
+# - $(inputs.test_dataset.path)
+# - "--outdir"
+# - $(inputs.output_directory)
+- -c
+- |
+  set -ex
+  mkdir $(inputs.output_directory_name)
+  touch $(inputs.output_directory_name)/$(inputs.train_dataset.nameroot)
+  touch $(inputs.output_directory_name)/$(inputs.test_dataset.nameroot)
+
+
+outputs:
+  ppi_fasta_files:
+    type: Directory
+    outputBinding:
+      glob: $(inputs.output_directory_name)
diff --git a/tests/wf/adv_prov/tools/ppi_annotations.py b/tests/wf/adv_prov/tools/ppi_annotations.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/tests/wf/adv_prov/tools/process_sabdab.cwl b/tests/wf/adv_prov/tools/process_sabdab.cwl
new file mode 100644
index 000000000..2f627fa15
--- /dev/null
+++ b/tests/wf/adv_prov/tools/process_sabdab.cwl
@@ -0,0 +1,67 @@
+#!/usr/bin/env cwl-runner
+
+cwlVersion: v1.2
+class: CommandLineTool
+
+doc: "Preprocess SAbDab summary file."
+intent: [ http://edamontology.org/operation_2409 ]
+
+hints:
+  # DockerRequirement:
+  #   dockerPull: amancevice/pandas:1.3.4-slim
+  SoftwareRequirement:
+    packages:
+      python:
+        version: [ "3.9.7" ]
+      pandas:
+        version: [ "1.3.4" ]
+
+baseCommand: cat # python3
+
+arguments:
+# - $(inputs.script.path)
+  - $(inputs.sabdab_summary.path)
+# - "-o"
+# - $(inputs.results_name)
+
+stdout: $(inputs.results_name)
+
+inputs:
+  script:
+    type: File
+    default:
+      class: File
+      location: ./process_sabdab_summary.py
+  sabdab_summary:
+    type: File
+    label: Summary file downloaded from SAbDab.
+    format: iana:text/tab-separated-values
+  results_name:
+    type: string
+    label: Name of output file in which processed results are stored.
+    default: "SAbDab_protein_antigens_PDB_chains.csv"
+
+outputs:
+  processed_summary:
+    type: File
+    format: iana:text/csv
+    outputBinding:
+      glob: $(inputs.results_name)
+
+s:author:
+- class: s:Person
+  s:name: "Renske de Wit"
+  s:identifier: https://orcid.org/0000-0003-0902-0086
+s:license: https://spdx.org/licenses/Apache-2.0
+
+s:mainEntity:
+  class: s:SoftwareApplication
+  s:license: https://spdx.org/licenses/Apache-2.0
+  s:author:
+  - class: s:Person
+    s:name: "Katharina Waury"
+    s:identifier:
+
+$namespaces:
+  iana: "https://www.iana.org/assignments/media-types/"
+  s: "https://schema.org/"
diff --git a/tests/wf/adv_prov/tools/process_sabdab_summary.py b/tests/wf/adv_prov/tools/process_sabdab_summary.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/tests/wf/adv_prov/tools/psp19_inputs.cwl b/tests/wf/adv_prov/tools/psp19_inputs.cwl
new file mode 100644
index 000000000..0b34196ae
--- /dev/null
+++ b/tests/wf/adv_prov/tools/psp19_inputs.cwl
@@ -0,0 +1,54 @@
+#!/usr/bin/env cwl-runner
+
+cwlVersion: v1.2
+class: CommandLineTool
+# hints:
+#   DockerRequirement:
+#     dockerPull: amancevice/pandas:1.3.4-slim
+#   SoftwareRequirement:
+#     packages:
+#       numpy:
+#         specs: [ https://anaconda.org/conda-forge/numpy ]
+#       python:
+#         version:
+
+baseCommand: bash # python3
+
+inputs:
+  script:
+    type: File
+    default:
+      class: File
+      location: ./get_psp19_inputs.py
+    # inputBinding:
+    #   position: 1
+  fasta:
+    type: Directory
+    format: edam:format_2200
+    # inputBinding:
+    #   position: 2
+  outdir:
+    type: string
+    # inputBinding:
+    #   position: 3
+    #   prefix: -o
+    default: "psp19_features"
+
+arguments:
+  - -c
+  - |
+    set -ex
+    mkdir $(inputs.outdir)
+    touch $(inputs.outdir)/$(inputs.fasta.nameroot)
+
+outputs:
+  psp19_features:
+    type: Directory
+    outputBinding:
+      glob: $(inputs.outdir)
+
+$namespaces:
+  edam: http://edamontology.org/
+
+$schemas:
+- https://edamontology.org/EDAM_1.25.owl
diff --git a/tests/wf/adv_prov/tools/train_epitope_model.cwl b/tests/wf/adv_prov/tools/train_epitope_model.cwl
new file mode 100644
index 000000000..141180356
--- /dev/null
+++ b/tests/wf/adv_prov/tools/train_epitope_model.cwl
@@ -0,0 +1,68 @@
+#!/usr/bin/env cwl-runner
+
+cwlVersion: v1.2
+class: CommandLineTool
+
+baseCommand: python3
+
+doc: "Model training."
+intent: [ http://edamontology.org/operation_2423 ]
+hints:
+  SoftwareRequirement:
+    packages:
+      python:
+        version: [ "3.9" ]
+      tqdm:
+        specs: [ https://pypi.org/project/tqdm/ ]
+        version: [ "4.64.0" ]
+      tensorflow-gpu:
+        specs: [ https://pypi.org/project/tensorflow-gpu/ ]
+        version: [ "2.9.1" ]
+      tensorflow-addons:
+        specs: [ https://pypi.org/project/tensorflow-addons/ ]
+        version: [ "0.17.1" ]
+      numpy:
+        version: [ "1.21.5" ]
+      click:
+        version: [ "8.0.4" ]
+      commentjson:
+        specs: [ https://pypi.org/project/commentjson/ ]
+        version: [ "0.9.0" ]
+
+arguments:
+- $(inputs.script.path)
+- $(inputs.config_file.path)
+- $(inputs.input_features.path)
+- $(inputs.input_labels.path)
+
+inputs:
+  script:
+    type: File
+    default:
+      class: File
+      location: ./emulated_model.py # this is a placeholder script
+  config_file:
+    type: File
+    default:
+      class: File
+      location: ../model_example_params.json
+    doc: "Configuration file used by the model. Here a standard file is used, but in the real workflow it should be generated by previous steps."
+  input_features:
+    type: Directory
+  input_labels:
+    type: Directory
+
+
+stdout: "training_log.txt"
+
+outputs:
+  train_log:
+    type: stdout
+    doc: "Output of the model containing predictions and/or performance on the test set."
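+# A hypothetical invocation of this emulated tool, for illustration only
+# (the feature and label directory names are placeholders; script and
+# config_file fall back to their defaults):
+#   cwltool train_epitope_model.cwl \
+#     --input_features pc7_features/ \
+#     --input_labels ppi_fasta/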
+
+
+
+
+
+
+

From de090f9cf9f1064bb868130d004b1afe66ebe433 Mon Sep 17 00:00:00 2001
From: "Michael R. Crusoe"
Date: Mon, 13 Nov 2023 13:16:10 +0100
Subject: [PATCH 13/14] Renske's surname is not a typo

---
 Makefile  | 2 +-
 setup.cfg | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/Makefile b/Makefile
index 29716b820..5f768d6be 100644
--- a/Makefile
+++ b/Makefile
@@ -113,7 +113,7 @@ diff_pydocstyle_report: pydocstyle_report.txt
 
 ## codespell-check : check for common misspellings
 codespell-check:
-	@codespell $(shell git ls-files | grep -v cwltool/schemas | grep -v cwltool/jshint/ | grep -v mypy-stubs) \
+	@codespell $(shell git ls-files | grep -v cwltool/schemas | grep -v cwltool/jshint/ | grep -v mypy-stubs | grep -v setup.cfg) \
 	|| (echo Probable typo found. Run \"make codespell-fix\" to accept suggested fixes, or add the word to the ignore list in setup.cfg ; exit 1)
 
 ## codespell-fix : fix common misspellings
diff --git a/setup.cfg b/setup.cfg
index 7d28a8ed6..12d240957 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -9,4 +9,5 @@ use_parentheses = True
 line_length = 88
 
 [codespell]
-ignore-words-list=ORE,ore,RO,ro,recuse
+builtin = clear
+ignore-words-list = ORE,ore,RO,ro,recuse

From dea0b8e6e0492ad5c67a840d73034ea3921e7990 Mon Sep 17 00:00:00 2001
From: "Michael R. Crusoe"
Date: Mon, 18 Dec 2023 12:58:39 +0100
Subject: [PATCH 14/14] store label & doc fields as prospective provenance

TODO: fix intent list
add/amend tests
---
 build-cwltool-docker.sh               |  2 +-
 cwltool/cwlprov/provenance_profile.py | 74 +++++++++++++++------------
 cwltool/singularity.py                |  2 +-
 tests/test_provenance.py              |  4 +-
 4 files changed, 46 insertions(+), 36 deletions(-)

diff --git a/build-cwltool-docker.sh b/build-cwltool-docker.sh
index a70fdf4df..3f20df771 100755
--- a/build-cwltool-docker.sh
+++ b/build-cwltool-docker.sh
@@ -8,4 +8,4 @@ ${engine} run -t -v /var/run/docker.sock:/var/run/docker.sock \
 	-v /tmp:/tmp \
 	-v "$PWD":/tmp/cwltool \
 	quay.io/commonwl/cwltool_module /bin/sh -c \
-	"apk add gcc bash git && pip install -r/tmp/cwltool/test-requirements.txt ; pytest -k 'not (test_bioconda or test_double_overwrite or test_env_filtering or test_biocontainers or test_disable_file_overwrite_without_ext or test_disable_file_creation_in_outdir_with_ext or test_write_write_conflict or test_directory_literal_with_real_inputs_inside or test_revsort_workflow or test_stdin_with_id_preset or test_no_compute_chcksum or test_packed_workflow_execution[tests/wf/count-lines1-wf.cwl-tests/wf/wc-job.json-False] or test_sequential_workflow or test_single_process_subwf_subwf_inline_step)' --ignore-glob '*test_udocker.py' -n 0 -v -rs --pyargs cwltool"
+	"apk add gcc bash git && pip install -r/tmp/cwltool/test-requirements.txt ; pytest -k 'not (test_bioconda or test_double_overwrite or test_env_filtering or test_biocontainers or test_disable_file_overwrite_without_ext or test_disable_file_creation_in_outdir_with_ext or test_write_write_conflict or test_directory_literal_with_real_inputs_inside or test_revsort_workflow or test_revsort_label_annotations or test_stdin_with_id_preset or test_no_compute_chcksum or test_packed_workflow_execution[tests/wf/count-lines1-wf.cwl-tests/wf/wc-job.json-False] or test_sequential_workflow or test_single_process_subwf_subwf_inline_step)' --ignore-glob '*test_udocker.py' -n 0 -v -rs --pyargs cwltool"
diff --git a/cwltool/cwlprov/provenance_profile.py b/cwltool/cwlprov/provenance_profile.py
index c8ceee232..1ed42ec28 100644
--- a/cwltool/cwlprov/provenance_profile.py
+++ b/cwltool/cwlprov/provenance_profile.py
@@ -51,9 +51,14 @@
 )
 from .writablebagfile import create_job, write_bag_file  # change this later
 
+# from schema_salad.utils import convert_to_dict
+
+
 if TYPE_CHECKING:
     from .ro import ResearchObject
 
+ProvType = Dict[Union[str, Identifier], Any]
+
 
 def copy_job_order(job: Union[Process, JobsType], job_order_object: CWLObjectType) -> CWLObjectType:
     """Create copy of job object for provenance."""
@@ -177,14 +182,14 @@ def host_provenance(document: ProvDocument) -> None:
         # by a user account, as cwltool is a command line tool
         account = self.document.agent(ACCOUNT_UUID)
         if self.orcid or self.full_name:
-            person: Dict[Union[str, Identifier], Any] = {
+            person: ProvType = {
                 PROV_TYPE: PROV["Person"],
                 "prov:type": SCHEMA["Person"],
             }
             if self.full_name:
                 person["prov:label"] = self.full_name
                 person["foaf:name"] = self.full_name
-                person["schema:name"] = self.full_name
+                person[SCHEMA["name"]] = self.full_name
             else:
                 # TODO: Look up name from ORCID API?
                 pass
@@ -235,13 +240,13 @@ def evaluate(
         """Evaluate the nature of job."""
         if not hasattr(process, "steps"):
             # record provenance of independent commandline tool executions
-            self.prospective_prov(job)
+            self.prospective_prov(job, process)
             customised_job = copy_job_order(job, job_order_object)
             self.used_artefacts(customised_job, self.workflow_run_uri)
             create_job(research_obj, customised_job)
         elif hasattr(job, "workflow"):
             # record provenance of workflow executions
-            self.prospective_prov(job)
+            self.prospective_prov(job, process)
             customised_job = copy_job_order(job, job_order_object)
             self.used_artefacts(customised_job, self.workflow_run_uri)
             # if CWLPROV['prov'].uri in job_order_object:  # maybe move this to another place
@@ -306,8 +311,7 @@ def _add_nested_annotations(
     ) -> ProvEntity:
         """Propagate input data annotations to provenance."""
         # Change https:// into http:// first
-        schema2_uri = "https://schema.org/"
-        if schema2_uri in annotation_key:
+        if (schema2_uri := "https://schema.org/") in annotation_key:
             annotation_key = SCHEMA[annotation_key.replace(schema2_uri, "")].uri
 
         if not isinstance(annotation_value, (MutableSequence, MutableMapping)):
@@ -377,9 +381,9 @@ def declare_file(self, value: CWLObjectType) -> Tuple[ProvEntity, ProvEntity, st
         self.document.specializationOf(file_entity, entity)
 
         # Identify all schema annotations
-        schema_annotations = dict(
-            [(v, value[v]) for v in value.keys() if v.startswith("https://schema.org")]
-        )
+        schema_annotations = {
+            v: value[v] for v in value.keys() if v.startswith("https://schema.org")
+        }
 
         # Transfer SCHEMA annotations to provenance
         for s in schema_annotations:
@@ -509,9 +513,9 @@ def declare_directory(self, value: CWLObjectType) -> ProvEntity:
         coll_b.add_attributes(coll_b_attribs)
 
         # Identify all schema annotations
-        schema_annotations = dict(
-            [(v, value[v]) for v in value.keys() if v.startswith("https://schema.org")]
-        )
+        schema_annotations = {
+            v: value[v] for v in value.keys() if v.startswith("https://schema.org")
+        }
 
         # Transfer SCHEMA annotations to provenance
         for s in schema_annotations:
@@ -571,7 +575,7 @@ def declare_artefact(self, value: Any) -> ProvEntity:
             self.research_object.add_uri(entity.identifier.uri)
             return entity
 
-        if isinstance(value, (str, str)):
+        if isinstance(value, str):
             (entity, _) = self.declare_string(value)
             return entity
 
@@ -734,35 +738,39 @@ def generate_output_prov(
                 entity, process_run_id, timestamp, None, {"prov:role": role}
             )
 
-    def prospective_prov(self, job: JobsType) -> None:
+    def prospective_prov(self, job: JobsType, process: Process) -> None:
         """Create prospective prov recording as wfdesc prov:Plan."""
+        prov_items: ProvType = {
+            PROV_TYPE: WFDESC["Workflow"] if isinstance(job, WorkflowJob) else WFDESC["Process"],
+            "prov:type": PROV["Plan"],
+            "prov:label": "Prospective provenance",
+        }
+        if "doc" in process.tool:
+            prov_items[SCHEMA["description"]] = process.tool["doc"]
+        if "label" in process.tool:
+            prov_items[SCHEMA["name"]] = process.tool["label"]
+        # # TypeError: unhashable type: 'list'
+        # if "intent" in process.tool:
+        #     prov_items[SCHEMA["featureList"]] = convert_to_dict(process.tool["intent"])
+        self.document.entity("wf:main", prov_items)
         if not isinstance(job, WorkflowJob):
-            # direct command line tool execution
-            self.document.entity(
-                "wf:main",
-                {
-                    PROV_TYPE: WFDESC["Process"],
-                    "prov:type": PROV["Plan"],
-                    "prov:label": "Prospective provenance",
-                },
-            )
             return
 
-        self.document.entity(
-            "wf:main",
-            {
-                PROV_TYPE: WFDESC["Workflow"],
-                "prov:type": PROV["Plan"],
-                "prov:label": "Prospective provenance",
-            },
-        )
-
         for step in job.steps:
             stepnametemp = "wf:main/" + str(step.name)[5:]
             stepname = urllib.parse.quote(stepnametemp, safe=":/,#")
+            provstep_items: ProvType = {
+                PROV_TYPE: WFDESC["Process"],
+                "prov:type": PROV["Plan"],
+            }
+            # WorkflowStep level annotations
+            if "doc" in step.tool:
+                provstep_items[SCHEMA["description"]] = step.tool["doc"]
+            if "label" in step.tool:
+                provstep_items[SCHEMA["name"]] = step.tool["label"]
             provstep = self.document.entity(
                 stepname,
-                {PROV_TYPE: WFDESC["Process"], "prov:type": PROV["Plan"]},
+                provstep_items,
             )
             self.document.entity(
                 "wf:main",
diff --git a/cwltool/singularity.py b/cwltool/singularity.py
index 2f590a140..1277092e3 100644
--- a/cwltool/singularity.py
+++ b/cwltool/singularity.py
@@ -369,7 +369,7 @@ def add_writable_file_volume(
         if self.inplace_update:
             try:
                 os.link(os.path.realpath(volume.resolved), host_outdir_tgt)
-            except os.error:
+            except OSError:
                 shutil.copy(volume.resolved, host_outdir_tgt)
         else:
             shutil.copy(volume.resolved, host_outdir_tgt)
diff --git a/tests/test_provenance.py b/tests/test_provenance.py
index 5dbe27d7c..d2cb7e0db 100644
--- a/tests/test_provenance.py
+++ b/tests/test_provenance.py
@@ -34,9 +34,11 @@
 
 
 def cwltool(tmp_path: Path, *args: Any) -> Path:
+    out_folder = tmp_path / "out"
+    out_folder.mkdir()
     prov_folder = tmp_path / "provenance"
     prov_folder.mkdir()
-    new_args = ["--provenance", str(prov_folder)]
+    new_args = ["--provenance", str(prov_folder), "--outdir", str(out_folder)]
     new_args.extend(args)
     # Run within a temporary directory to not pollute git checkout
     tmp_dir = tmp_path / "cwltool-run"
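
For reference, a minimal standalone sketch of the recursive annotation
propagation that _add_nested_annotations implements above. It assumes only
the prov package; the namespace registrations mirror what the profile sets
up, and the sample annotations ("pdb_ids.txt", "Jane Doe") are made up for
illustration:

    import uuid
    from collections.abc import Mapping
    from typing import Any

    from prov.model import ProvDocument, ProvEntity

    doc = ProvDocument()
    doc.add_namespace("id", "urn:uuid:")  # lets urn:uuid:... identifiers resolve
    doc.add_namespace("schema", "https://schema.org/")

    def add_nested_annotations(annotations: Mapping, e: ProvEntity) -> ProvEntity:
        """Attach plain values directly; link nested mappings via fresh entities."""
        for key, value in annotations.items():
            if isinstance(value, (str, bool, int, float)):
                e.add_attributes({key: value})
            elif isinstance(value, Mapping):
                nested = doc.entity(uuid.uuid4().urn)
                e.add_attributes({key: nested.identifier})
                add_nested_annotations(value, nested)
            # lists are handled by the real implementation; omitted here
        return e

    file_entity = doc.entity(uuid.uuid4().urn)
    add_nested_annotations(
        {"schema:name": "pdb_ids.txt", "schema:author": {"schema:name": "Jane Doe"}},
        file_entity,
    )
    print(doc.get_provn())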