From 5e00ffe0f6df90131f327f4e9a955a02d1aed020 Mon Sep 17 00:00:00 2001
From: John Chilton
Date: Tue, 12 Nov 2024 08:30:44 -0500
Subject: [PATCH 1/2] Allow a posix file source to prefer linking.

The linking upload parameters will still be respected, but if none of
them are set, data fetch will default to just linking files during
upload. This uses Dataset.external_filename instead of symlinks in the
objectstore so that Galaxy can track the links better and so the
behavior is closer to the way data libraries have always worked.
---
 lib/galaxy/files/sources/__init__.py        |  7 ++
 lib/galaxy/files/sources/posix.py           |  7 ++
 lib/galaxy/files/uris.py                    |  9 ++-
 lib/galaxy/tools/data_fetch.py              | 77 +++++++++++++++------
 lib/galaxy_test/driver/integration_setup.py | 20 +++++-
 test/integration/test_remote_files_posix.py | 46 ++++++++++--
 6 files changed, 138 insertions(+), 28 deletions(-)

diff --git a/lib/galaxy/files/sources/__init__.py b/lib/galaxy/files/sources/__init__.py
index 19fcc1b21475..fd1e84670173 100644
--- a/lib/galaxy/files/sources/__init__.py
+++ b/lib/galaxy/files/sources/__init__.py
@@ -277,6 +277,10 @@ def to_dict(self, for_serialization=False, user_context: "OptionalUserContext" =
         context doesn't need to be present after the plugin is re-hydrated.
         """
 
+    @abc.abstractmethod
+    def prefer_links(self) -> bool:
+        """Prefer linking to files."""
+
 
 class SupportsBrowsing(metaclass=abc.ABCMeta):
     """An interface indicating that this filesource is browsable.
@@ -351,6 +355,9 @@ def user_has_access(self, user_context: "OptionalUserContext") -> bool:
             or (self._user_has_required_roles(user_context) and self._user_has_required_groups(user_context))
         )
 
+    def prefer_links(self) -> bool:
+        return False
+
     @property
     def user_context_required(self) -> bool:
         return self.requires_roles is not None or self.requires_groups is not None
diff --git a/lib/galaxy/files/sources/posix.py b/lib/galaxy/files/sources/posix.py
index 8768b5b5ca9c..258d5ef20a69 100644
--- a/lib/galaxy/files/sources/posix.py
+++ b/lib/galaxy/files/sources/posix.py
@@ -26,6 +26,7 @@
 DEFAULT_ENFORCE_SYMLINK_SECURITY = True
 DEFAULT_DELETE_ON_REALIZE = False
 DEFAULT_ALLOW_SUBDIR_CREATION = True
+DEFAULT_PREFER_LINKS = False
 
 
 class PosixFilesSourceProperties(FilesSourceProperties, total=False):
@@ -33,6 +34,7 @@ class PosixFilesSourceProperties(FilesSourceProperties, total=False):
     enforce_symlink_security: bool
     delete_on_realize: bool
     allow_subdir_creation: bool
+    prefer_links: bool
 
 
 class PosixFilesSource(BaseFilesSource):
@@ -53,6 +55,10 @@ def __init__(self, **kwd: Unpack[PosixFilesSourceProperties]):
         self.enforce_symlink_security = props.get("enforce_symlink_security", DEFAULT_ENFORCE_SYMLINK_SECURITY)
         self.delete_on_realize = props.get("delete_on_realize", DEFAULT_DELETE_ON_REALIZE)
         self.allow_subdir_creation = props.get("allow_subdir_creation", DEFAULT_ALLOW_SUBDIR_CREATION)
+        self._prefer_links = props.get("prefer_links", DEFAULT_PREFER_LINKS)
+
+    def prefer_links(self) -> bool:
+        return self._prefer_links
 
     def _list(
         self,
@@ -182,6 +188,7 @@ def _serialization_props(self, user_context: OptionalUserContext = None) -> Posi
             "enforce_symlink_security": self.enforce_symlink_security,
             "delete_on_realize": self.delete_on_realize,
             "allow_subdir_creation": self.allow_subdir_creation,
+            "prefer_links": self._prefer_links,
         }
 
     @property
diff --git a/lib/galaxy/files/uris.py b/lib/galaxy/files/uris.py
index d35e51d696c4..9d48853294d6 100644
--- a/lib/galaxy/files/uris.py
+++ b/lib/galaxy/files/uris.py
@@ -49,8 +49,7 @@ def stream_url_to_file(
     target_path: Optional[str] = None,
     file_source_opts: Optional[FilesSourceOptions] = None,
 ) -> str:
-    if file_sources is None:
-        file_sources = ConfiguredFileSources.from_dict(None, load_stock_plugins=True)
+    file_sources = ensure_file_sources(file_sources)
     file_source, rel_path = file_sources.get_file_source_path(url)
     if file_source:
         if not target_path:
@@ -62,6 +61,12 @@ def stream_url_to_file(
     raise NoMatchingFileSource(f"Could not find a matching handler for: {url}")
 
 
+def ensure_file_sources(file_sources: Optional["ConfiguredFileSources"]) -> "ConfiguredFileSources":
+    if file_sources is None:
+        file_sources = ConfiguredFileSources.from_dict(None, load_stock_plugins=True)
+    return file_sources
+
+
 def stream_to_file(stream, suffix="", prefix="", dir=None, text=False, **kwd):
     """Writes a stream to a temporary file, returns the temporary file's name"""
     fd, temp_name = tempfile.mkstemp(suffix=suffix, prefix=prefix, dir=dir, text=text)
diff --git a/lib/galaxy/tools/data_fetch.py b/lib/galaxy/tools/data_fetch.py
index bd5f1ae0b8e6..bad82540d7fa 100644
--- a/lib/galaxy/tools/data_fetch.py
+++ b/lib/galaxy/tools/data_fetch.py
@@ -23,6 +23,7 @@
     UploadProblemException,
 )
 from galaxy.files.uris import (
+    ensure_file_sources,
     stream_to_file,
     stream_url_to_file,
 )
@@ -97,13 +98,13 @@ def expand_elements_from(target_or_item):
             decompressed_directory = _decompress_target(upload_config, target_or_item)
             items = _directory_to_items(decompressed_directory)
         elif elements_from == "bagit":
-            _, elements_from_path = _has_src_to_path(upload_config, target_or_item, is_dataset=False)
+            _, elements_from_path, _ = _has_src_to_path(upload_config, target_or_item, is_dataset=False)
             items = _bagit_to_items(elements_from_path)
         elif elements_from == "bagit_archive":
             decompressed_directory = _decompress_target(upload_config, target_or_item)
             items = _bagit_to_items(decompressed_directory)
         elif elements_from == "directory":
-            _, elements_from_path = _has_src_to_path(upload_config, target_or_item, is_dataset=False)
+            _, elements_from_path, _ = _has_src_to_path(upload_config, target_or_item, is_dataset=False)
             items = _directory_to_items(elements_from_path)
         else:
             raise Exception(f"Unknown elements from type encountered [{elements_from}]")
@@ -205,7 +206,7 @@ def _resolve_item(item):
                     pass
                 key = keys[composite_item_idx]
                 writable_file = writable_files[key]
-                _, src_target = _has_src_to_path(upload_config, composite_item)
+                _, src_target, _ = _has_src_to_path(upload_config, composite_item)
                 # do the writing
                 sniff.handle_composite_file(
                     datatype,
@@ -238,10 +239,23 @@ def _resolve_item_with_primary(item):
 
         converted_path = None
         deferred = upload_config.get_option(item, "deferred")
+
+        link_data_only = upload_config.link_data_only
+        link_data_only_explicit = upload_config.link_data_only_explicit
+        if "link_data_only" in item:
+            # Allow overriding this on a per file basis.
+            link_data_only, link_data_only_explicit = _link_data_only(item)
+
         name: str
         path: Optional[str]
+        default_in_place = False
         if not deferred:
-            name, path = _has_src_to_path(upload_config, item, is_dataset=True)
+            name, path, is_link = _has_src_to_path(
+                upload_config, item, is_dataset=True, link_data_only_explicitly_set=link_data_only_explicit
+            )
+            if is_link:
+                link_data_only = True
+                default_in_place = True
         else:
             name, path = _has_src_to_name(item) or "Deferred Dataset", None
         sources = []
@@ -266,10 +280,6 @@ def _resolve_item_with_primary(item):
             item["error_message"] = error_message
         dbkey = item.get("dbkey", "?")
 
-        link_data_only = upload_config.link_data_only
-        if "link_data_only" in item:
-            # Allow overriding this on a per file basis.
-            link_data_only = _link_data_only(item)
 
         ext = "data"
         staged_extra_files = None
@@ -281,7 +291,7 @@ def _resolve_item_with_primary(item):
 
         effective_state = "ok"
         if not deferred and not error_message:
-            in_place = item.get("in_place", False)
+            in_place = item.get("in_place", default_in_place)
             purge_source = item.get("purge_source", True)
 
             registry = upload_config.registry
@@ -339,7 +349,7 @@ def walk_extra_files(items, prefix=""):
                 item_prefix = os.path.join(prefix, name)
                 walk_extra_files(item.get("elements"), prefix=item_prefix)
             else:
-                src_name, src_path = _has_src_to_path(upload_config, item)
+                src_name, src_path, _ = _has_src_to_path(upload_config, item)
                 if prefix:
                     rel_path = os.path.join(prefix, src_name)
                 else:
@@ -425,7 +435,7 @@ def _bagit_to_items(directory):
 
 
 def _decompress_target(upload_config: "UploadConfig", target: Dict[str, Any]):
-    elements_from_name, elements_from_path = _has_src_to_path(upload_config, target, is_dataset=False)
+    elements_from_name, elements_from_path, _ = _has_src_to_path(upload_config, target, is_dataset=False)
     # by default Galaxy will check for a directory with a single file and interpret that
     # as the new root for expansion, this is a good user experience for uploading single
     # files in a archive but not great from an API perspective. Allow disabling by setting
@@ -483,13 +493,33 @@ def _has_src_to_name(item) -> Optional[str]:
     return name
 
 
-def _has_src_to_path(upload_config: "UploadConfig", item: Dict[str, Any], is_dataset: bool = False) -> Tuple[str, str]:
+def _has_src_to_path(
+    upload_config: "UploadConfig",
+    item: Dict[str, Any],
+    is_dataset: bool = False,
+    link_data_only: bool = False,
+    link_data_only_explicitly_set: bool = False,
+) -> Tuple[str, str, bool]:
     assert "src" in item, item
     src = item.get("src")
     name = item.get("name")
+    is_link = False
     if src == "url":
         url = item.get("url")
+        file_sources = ensure_file_sources(upload_config.file_sources)
         assert url, "url cannot be empty"
+        if not link_data_only_explicitly_set:
+            file_source, rel_path = file_sources.get_file_source_path(url)
+            prefer_links = file_source.prefer_links()
+            if prefer_links:
+                if rel_path.startswith("/"):
+                    rel_path = rel_path[1:]
+                path = os.path.abspath(os.path.join(file_source.root, rel_path))
+                if name is None:
+                    name = url.split("/")[-1]
+                is_link = True
+                return name, path, is_link
+
         try:
             path = stream_url_to_file(url, file_sources=upload_config.file_sources, dir=upload_config.working_directory)
         except Exception as e:
@@ -513,7 +543,7 @@ def _has_src_to_path(upload_config: "UploadConfig", item: Dict[str, Any], is_dat
         path = item["path"]
     if name is None:
         name = os.path.basename(path)
-    return name, path
+    return name, path, is_link
 
 
 def _handle_hash_validation(hash_function: HashFunctionNameEnum, hash_value: str, path: str):
@@ -564,7 +594,7 @@ def __init__(
         self.space_to_tab = request.get("space_to_tab", False)
         self.auto_decompress = request.get("auto_decompress", False)
         self.deferred = request.get("deferred", False)
-        self.link_data_only = _link_data_only(request)
+        self.link_data_only, self.link_data_only_explicit = _link_data_only(request)
 
         self.file_sources_dict = file_sources_dict
         self._file_sources = None
@@ -616,12 +646,19 @@ def ensure_in_working_directory(self, path, purge_source, in_place):
         return new_path
 
 
-def _link_data_only(has_config_dict):
-    link_data_only = has_config_dict.get("link_data_only", False)
-    if not isinstance(link_data_only, bool):
-        # Allow the older string values of 'copy_files' and 'link_to_files'
-        link_data_only = link_data_only == "copy_files"
-    return link_data_only
+def _link_data_only(has_config_dict) -> Tuple[bool, bool]:
+    if "link_data_only" in has_config_dict:
+        link_data_only_raw = has_config_dict["link_data_only"]
+        if not isinstance(link_data_only_raw, bool):
+            # Allow the older string values of 'copy_files' and 'link_to_files'
+            link_data_only = link_data_only_raw == "copy_files"
+        else:
+            link_data_only = link_data_only_raw
+        link_data_only_explicit = True
+    else:
+        link_data_only = False
+        link_data_only_explicit = False
+    return link_data_only, link_data_only_explicit
 
 
 def _for_each_src(f, obj):
diff --git a/lib/galaxy_test/driver/integration_setup.py b/lib/galaxy_test/driver/integration_setup.py
index b63b567247ea..30194e63c830 100644
--- a/lib/galaxy_test/driver/integration_setup.py
+++ b/lib/galaxy_test/driver/integration_setup.py
@@ -15,7 +15,9 @@
 REQUIRED_GROUP_EXPRESSION = f"{GROUP_A} or '{GROUP_B}'"
 
 
-def get_posix_file_source_config(root_dir: str, roles: str, groups: str, include_test_data_dir: bool) -> str:
+def get_posix_file_source_config(
+    root_dir: str, roles: str, groups: str, include_test_data_dir: bool, prefer_links: bool = False
+) -> str:
     rval = f"""
 - type: posix
   id: posix_test
@@ -26,6 +28,17 @@ def get_posix_file_source_config(root_dir: str, roles: str, groups: str, include
   requires_roles: {roles}
   requires_groups: {groups}
 """
+    if prefer_links:
+        rval += f"""
+- type: posix
+  id: linking_source
+  label: Posix
+  doc: Files from local path to links
+  root: {root_dir}
+  writable: true
+  prefer_links: true
+"""
+
     if include_test_data_dir:
         rval += """
 - type: posix
@@ -44,9 +57,10 @@ def create_file_source_config_file_on(
     include_test_data_dir,
     required_role_expression,
     required_group_expression,
+    prefer_links: bool = False,
 ):
     file_contents = get_posix_file_source_config(
-        root_dir, required_role_expression, required_group_expression, include_test_data_dir
+        root_dir, required_role_expression, required_group_expression, include_test_data_dir, prefer_links=prefer_links
    )
     file_path = os.path.join(temp_dir, "file_sources_conf_posix.yml")
     with open(file_path, "w") as f:
@@ -67,6 +81,7 @@ def handle_galaxy_config_kwds(
         # Require role for access but do not require groups by default on every test to simplify them
         required_role_expression=REQUIRED_ROLE_EXPRESSION,
         required_group_expression="",
+        prefer_links: bool = False,
     ):
         temp_dir = os.path.realpath(mkdtemp())
         clazz_ = clazz_ or cls
@@ -79,6 +94,7 @@ def handle_galaxy_config_kwds(
             clazz_.include_test_data_dir,
             required_role_expression,
             required_group_expression,
+            prefer_links=prefer_links,
         )
 
         config["file_sources_config_file"] = file_sources_config_file
diff --git a/test/integration/test_remote_files_posix.py b/test/integration/test_remote_files_posix.py
index 7d588cdf9d25..0fbc4b7f8d93 100644
--- a/test/integration/test_remote_files_posix.py
+++ b/test/integration/test_remote_files_posix.py
@@ -1,8 +1,8 @@
-# Before running this test, start nginx+webdav in Docker using following command:
-# docker run -v `pwd`/test/integration/webdav/data:/media -e WEBDAV_USERNAME=alice -e WEBDAV_PASSWORD=secret1234 -p 7083:7083 jmchilton/webdavdev
-# Apache Docker host (shown next) doesn't work because displayname not set in response.
-# docker run -v `pwd`/test/integration/webdav:/var/lib/dav -e AUTH_TYPE=Basic -e USERNAME=alice -e PASSWORD=secret1234 -e LOCATION=/ -p 7083:80 bytemark/webdav
-
+import os
+
+from sqlalchemy import select
+
+from galaxy.model import Dataset
 from galaxy_test.base import api_asserts
 from galaxy_test.base.populators import DatasetPopulator
 from galaxy_test.driver import integration_util
@@ -108,3 +108,41 @@ def _assert_list_response_matches_fixtures(self, list_response):
 
     def _assert_access_forbidden_response(self, response):
         api_asserts.assert_status_code_is(response, 403)
+
+
+class TestPreferLinksPosixFileSourceIntegration(PosixFileSourceSetup, integration_util.IntegrationTestCase):
+    dataset_populator: DatasetPopulator
+
+    @classmethod
+    def handle_galaxy_config_kwds(cls, config):
+        PosixFileSourceSetup.handle_galaxy_config_kwds(
+            config,
+            cls,
+            prefer_links=True,
+        )
+
+    def setUp(self):
+        super().setUp()
+        self._write_file_fixtures()
+        self.dataset_populator = DatasetPopulator(self.galaxy_interactor)
+
+    def test_links_by_default(self):
+        with self.dataset_populator.test_history() as history_id:
+            element = dict(src="url", url="gxfiles://linking_source/a")
+            target = {
+                "destination": {"type": "hdas"},
+                "elements": [element],
+            }
+            targets = [target]
+            payload = {
+                "history_id": history_id,
+                "targets": targets,
+            }
+            new_dataset = self.dataset_populator.fetch(payload, assert_ok=True).json()["outputs"][0]
+            content = self.dataset_populator.get_history_dataset_content(history_id, dataset=new_dataset)
+            assert content == "a\n", content
+            stmt = select(Dataset).order_by(Dataset.create_time.desc()).limit(1)
+            dataset = self._app.model.session.execute(stmt).unique().scalar_one()
+            assert dataset.external_filename.endswith("/root/a")
+            assert os.path.exists(dataset.external_filename)
+            assert open(dataset.external_filename).read() == "a\n"

From 272e9ff159a2d387262977901f861f0206808bca Mon Sep 17 00:00:00 2001
From: John Chilton
Date: Wed, 13 Nov 2024 09:46:04 -0500
Subject: [PATCH 2/2] A test case for running a tool on a linked dataset.
---
 lib/galaxy/config/__init__.py               |  8 ++++++++
 test/integration/test_remote_files_posix.py | 12 ++++++++++++
 2 files changed, 20 insertions(+)

diff --git a/lib/galaxy/config/__init__.py b/lib/galaxy/config/__init__.py
index d4b1170673a1..8ccc68ffb631 100644
--- a/lib/galaxy/config/__init__.py
+++ b/lib/galaxy/config/__init__.py
@@ -115,6 +115,14 @@
             "level": "INFO",
             "qualname": "watchdog.observers.inotify_buffer",
         },
+        "py.warnings": {
+            "level": "ERROR",
+            "qualname": "py.warnings",
+        },
+        "celery.utils.functional": {
+            "level": "INFO",
+            "qualname": "celery.utils.functional",
+        },
     },
     "filters": {
         "stack": {
diff --git a/test/integration/test_remote_files_posix.py b/test/integration/test_remote_files_posix.py
index 0fbc4b7f8d93..75a449ff2e45 100644
--- a/test/integration/test_remote_files_posix.py
+++ b/test/integration/test_remote_files_posix.py
@@ -112,6 +112,7 @@ def _assert_access_forbidden_response(self, response):
 
 class TestPreferLinksPosixFileSourceIntegration(PosixFileSourceSetup, integration_util.IntegrationTestCase):
     dataset_populator: DatasetPopulator
+    framework_tool_and_types = True
 
     @classmethod
     def handle_galaxy_config_kwds(cls, config):
@@ -146,3 +147,14 @@ def test_links_by_default(self):
             assert dataset.external_filename.endswith("/root/a")
             assert os.path.exists(dataset.external_filename)
             assert open(dataset.external_filename).read() == "a\n"
+            payload = self.dataset_populator.run_tool(
+                tool_id="cat",
+                inputs={
+                    "input1": {"src": "hda", "id": new_dataset["id"]},
+                },
+                history_id=history_id,
+            )
+            derived_dataset = payload["outputs"][0]
+            self.dataset_populator.wait_for_history(history_id, assert_ok=True)
+            derived_content = self.dataset_populator.get_history_dataset_content(history_id, dataset=derived_dataset)
+            assert derived_content.strip() == "a"
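
For anyone trying the feature out, here is a minimal sketch of the admin-facing
configuration it implies, modeled on the YAML that get_posix_file_source_config()
generates in the integration fixture above. The id, label, doc, and root values
below are illustrative placeholders; only "type: posix", "writable", and the new
"prefer_links" flag come from this patch:

    # file_sources_conf.yml (hypothetical example mirroring the test fixture)
    - type: posix
      id: linking_source   # illustrative; the integration test happens to use this id
      label: Local data (linked)
      doc: Files from a local path, linked into Galaxy rather than copied
      root: /data/shared   # illustrative path
      writable: true
      prefer_links: true   # new option from this patch; defaults to false

With such a source configured, a data-fetch request that leaves link_data_only
unset resolves gxfiles://linking_source/... URIs by recording the original path
in Dataset.external_filename instead of copying the file into the objectstore;
an explicit link_data_only in the request still takes precedence, per
_has_src_to_path() above.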
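
And a hedged sketch of the client side, posting the same payload shape that
test_links_by_default builds through DatasetPopulator.fetch() to Galaxy's
/api/tools/fetch endpoint. The server URL, API key, and history id are
placeholders, and the response handling assumes only what the test itself
asserts:

    import requests

    GALAXY_URL = "https://galaxy.example.org"  # placeholder
    API_KEY = "<api key>"  # placeholder
    HISTORY_ID = "<history id>"  # placeholder

    # Same payload shape the integration test builds via DatasetPopulator.fetch().
    payload = {
        "history_id": HISTORY_ID,
        "targets": [
            {
                "destination": {"type": "hdas"},
                "elements": [{"src": "url", "url": "gxfiles://linking_source/a"}],
            }
        ],
    }
    response = requests.post(
        f"{GALAXY_URL}/api/tools/fetch",
        json=payload,
        headers={"x-api-key": API_KEY},
    )
    response.raise_for_status()
    # With prefer_links: true on the source and no explicit link_data_only in
    # the request, the new dataset should reference the file in place via
    # Dataset.external_filename rather than a copy in the objectstore.
    print(response.json()["outputs"][0])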