Skip to content

Commit e4433cc

Browse files
Merge pull request #24 from workflowhub-eu/refactor-workflow
Add Snakemake workflow
2 parents 262e220 + f8a87db commit e4433cc

File tree

7 files changed

+1697
-12
lines changed

7 files changed

+1697
-12
lines changed

Dockerfile

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,29 @@
1-
# Docker container with poetry for python package management
1+
FROM python:3.11-slim
22

3-
FROM python:3.10-slim
4-
5-
# Install poetry
63
# Install Poetry; --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip install --no-cache-dir poetry
74

85
# Set the working directory
96
WORKDIR /app
107

11-
# Copy the pyproject.toml
8+
# Install build tools for Snakemake (gcc, make, etc.)
9+
# Skip recommended packages and remove the apt package lists in the same layer
# so neither ends up in the final image.
RUN apt-get update \
    && apt-get install -y --no-install-recommends build-essential \
    && rm -rf /var/lib/apt/lists/*
10+
11+
# Copy the pyproject.toml file
1212
COPY pyproject.toml /app/
1313

1414
# Install the dependencies
1515
RUN poetry install --no-root
1616

17-
# Copy the rest of the files
17+
# Copy the rest of the application files
1818
COPY . /app
1919

2020
# Install the package
2121
RUN poetry install
2222

23-
# Run the application
24-
CMD ["help"]
23+
# Install Snakemake using Poetry
24+
# NOTE(review): `poetry add` rewrites pyproject.toml inside the image and resolves
# an unpinned snakemake at build time. It runs after `COPY . /app`, so any source
# change invalidates this layer and re-resolves the dependency. Consider declaring
# snakemake in pyproject.toml so the earlier `poetry install --no-root` layer
# installs it — confirm with the project owners.
RUN poetry add snakemake
25+
26+
# Set the entry point for the container
2527
ENTRYPOINT ["poetry", "run"]
28+
29+
CMD ["help"]

Snakefile

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
# TODO: Accept these values as Snakemake config/arguments instead of hard-coding them.
2+
WORKFLOW_IDS = range(1,11)
3+
VERSIONS = ['1']
4+
OUTPUT_DIRS = "data"
5+
MERGED_FILE = "merged.ttl"
6+
7+
8+
def list_expected_files():
9+
files = []
10+
for wf_id in WORKFLOW_IDS:
11+
for ver in VERSIONS:
12+
files.append(f"{OUTPUT_DIRS}/{wf_id}_{ver}_ro-crate-metadata.json")
13+
return files
14+
15+
rule all:
16+
input:
17+
MERGED_FILE
18+
19+
rule source_ro_crates:
20+
output:
21+
"created_files.json"
22+
shell:
23+
"""
24+
# Create the output directory if it doesn't exist:
25+
mkdir -p {OUTPUT_DIRS}
26+
27+
# Run the source_crates script to download the RO Crate metadata:
28+
python workflowhub_graph/source_crates.py --workflow-ids 1-10 --prod --all-versions
29+
30+
# After sourcing, check which files were actually created:
31+
python workflowhub_graph/check_outputs.py --workflow-ids 1-10 --versions {VERSIONS} --output-dir {OUTPUT_DIRS}
32+
"""
33+
34+
rule report_created_files:
35+
input:
36+
"created_files.json"
37+
shell:
38+
"""
39+
echo "Files created:"
40+
cat created_files.json
41+
"""
42+
43+
rule merge_files:
44+
input:
45+
"created_files.json"
46+
output:
47+
MERGED_FILE
48+
run:
49+
import json
50+
import os
51+
52+
# Load the list of created files:
53+
with open("created_files.json") as f:
54+
created_files = json.load(f)
55+
56+
files_to_merge = [f"data/{os.path.basename(file)}" for file in created_files]
57+
58+
# If no files are available to merge, raise an exception:
59+
if not files_to_merge:
60+
raise ValueError("No files to merge in data directory.")
61+
62+
file_patterns = " ".join(files_to_merge)
63+
64+
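# Merge the JSON-LD files into a single RDF graph and output as a TTL file.
# NOTE(review): the command below passes the glob "data/*.json"; the file_patterns
# string built above is not used — confirm which set of files should be merged.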
65+
shell(f"""
66+
python workflowhub_graph/merge.py {output[0]} -p "data/*.json"
67+
""")

0 commit comments

Comments
 (0)