Skip to content

Commit

Permalink
Name WDL output files in a human-readable way (#5046)
Browse files Browse the repository at this point in the history
* Pack task path info into file URIs

* Name downloaded files in WDL human-readably

* Add test checks for pretty output filenames

* Add unit test for file path selection

* Add a test to show the pack/unpack round trip failure

* Fix the pack/unpack round trip failure

* Address review comments

---------

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
  • Loading branch information
adamnovak and github-actions[bot] committed Aug 14, 2024
1 parent 7d4d074 commit b5fd307
Show file tree
Hide file tree
Showing 2 changed files with 267 additions and 82 deletions.
74 changes: 72 additions & 2 deletions src/toil/test/wdl/wdltoil_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import os
import re
import shutil
import string
import subprocess
import unittest
from uuid import uuid4
Expand All @@ -13,6 +14,7 @@
import logging
import pytest

from toil.fileStores import FileID
from toil.provisioners import cluster_factory
from toil.test import (ToilTest,
needs_docker_cuda,
Expand Down Expand Up @@ -184,8 +186,22 @@ def test_miniwdl_self_test(self, extra_args: Optional[List[str]] = None) -> None
assert len(outputs['hello_caller.message_files']) == 2
for item in outputs['hello_caller.message_files']:
# All the files should be strings in the "out" directory
assert isinstance(item, str)
assert item.startswith(out_dir)
assert isinstance(item, str), "File output must be a string"
assert item.startswith(out_dir), "File output must be in the output directory"

# Look at the filename within that directory
name_in_out_dir = item[len(out_dir):]

# Ity should contain the job name of "hello", so they are human-readable.
assert "hello" in name_in_out_dir, f"File output {name_in_out_dir} should have the originating task name in it"

# And it should not contain non-human-readable content.
#
# We use a threshold number of digits as a proxy for this, but
# don't try and get around this by just rolling other random
# strings; we want these outputs to be human-readable!!!
digit_count = len([c for c in name_in_out_dir if c in string.digits])
assert digit_count < 3, f"File output {name_in_out_dir} has {digit_count} digits, which is too many to be plausibly human-readable"

assert 'hello_caller.messages' in outputs
assert outputs['hello_caller.messages'] == ["Hello, Alyssa P. Hacker!", "Hello, Ben Bitdiddle!"]
Expand Down Expand Up @@ -521,6 +537,60 @@ def test_remove_common_leading_whitespace(self):
trimmed = remove_common_leading_whitespace(expr)
assert trimmed.command == True

def test_choose_human_readable_directory(self):
"""
Test to make sure that we pick sensible but non-colliding directories to put files in.
"""

from toil.wdl.wdltoil import choose_human_readable_directory, DirectoryNamingStateDict

state: DirectoryNamingStateDict = {}

# The first time we should get apath with the task name and without the ID
first_chosen = choose_human_readable_directory("root", "taskname", "111-222-333", state)
assert first_chosen.startswith("root")
assert "taskname" in first_chosen
assert "111-222-333" not in first_chosen

# If we use the same ID we should get the same result
same_id = choose_human_readable_directory("root", "taskname", "111-222-333", state)
self.assertEqual(same_id, first_chosen)

# If we use a different ID we shoudl get a different result still obeying the constraints
diff_id = choose_human_readable_directory("root", "taskname", "222-333-444", state)
self.assertNotEqual(diff_id, first_chosen)
assert diff_id.startswith("root")
assert "taskname" in diff_id
assert "222-333-444" not in diff_id

def test_uri_packing(self):
"""
Test to make sure Toil URI packing brings through the required information.
"""

from toil.wdl.wdltoil import pack_toil_uri, unpack_toil_uri

# Set up a file
file_id = FileID("fileXYZ", 123, True)
task_path = "the_wf.the_task"
dir_id = uuid4()
file_basename = "thefile.txt"

# Pack and unpack it
uri = pack_toil_uri(file_id, task_path, dir_id, file_basename)
unpacked = unpack_toil_uri(uri)

# Make sure we got what we put in
self.assertEqual(unpacked[0], file_id)
self.assertEqual(unpacked[0].size, file_id.size)
self.assertEqual(unpacked[0].executable, file_id.executable)

self.assertEqual(unpacked[1], task_path)

# TODO: We don't make the UUIDs back into UUID objects
self.assertEqual(unpacked[2], str(dir_id))

self.assertEqual(unpacked[3], file_basename)



Expand Down
Loading

0 comments on commit b5fd307

Please sign in to comment.