Skip to content

Commit 42bf463

Browse files
authored
Merge pull request #19406 from nsoranzo/TarFile.extraction_filter
Set safe default extraction filter for tar archives
2 parents 371242c + 54df792 commit 42bf463

File tree

14 files changed

+92
-65
lines changed

14 files changed

+92
-65
lines changed

lib/galaxy/datatypes/isa.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -247,7 +247,8 @@ def groom_dataset_content(self, file_name: str) -> None:
247247
# perform extraction
248248
# For some ZIP files CompressedFile::extract() extract the file inside <output_folder>/<file_name> instead of outputing it inside <output_folder>. So we first create a temporary folder, extract inside it, and move content to final destination.
249249
temp_folder = tempfile.mkdtemp()
250-
CompressedFile(file_name).extract(temp_folder)
250+
with CompressedFile(file_name) as cf:
251+
cf.extract(temp_folder)
251252
shutil.rmtree(output_path)
252253
extracted_files = os.listdir(temp_folder)
253254
logger.debug(" ".join(extracted_files))

lib/galaxy/model/store/__init__.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -3075,7 +3075,8 @@ def source_to_import_store(
30753075
if ModelStoreFormat.is_compressed(model_store_format):
30763076
try:
30773077
temp_dir = mkdtemp()
3078-
target_dir = CompressedFile(target_path).extract(temp_dir)
3078+
with CompressedFile(target_path) as cf:
3079+
target_dir = cf.extract(temp_dir)
30793080
finally:
30803081
if delete:
30813082
os.remove(target_path)

lib/galaxy/tool_util/verify/interactor.py

+10-3
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
import re
66
import shutil
77
import sys
8-
import tarfile
98
import tempfile
109
import time
1110
import urllib.parse
@@ -45,6 +44,7 @@
4544
)
4645
from galaxy.util import requests
4746
from galaxy.util.bunch import Bunch
47+
from galaxy.util.compression_utils import CompressedFile
4848
from galaxy.util.hash_util import (
4949
memory_bound_hexdigest,
5050
parse_checksum_hash,
@@ -434,7 +434,14 @@ def test_data_path(self, tool_id, filename, tool_version=None):
434434
return result
435435
raise Exception(result["err_msg"])
436436

437-
def test_data_download(self, tool_id, filename, mode="file", is_output=True, tool_version=None):
437+
def test_data_download(
438+
self,
439+
tool_id: str,
440+
filename: str,
441+
mode: Literal["directory", "file"] = "file",
442+
is_output: bool = True,
443+
tool_version: Optional[str] = None,
444+
):
438445
result = None
439446
local_path = None
440447

@@ -453,7 +460,7 @@ def test_data_download(self, tool_id, filename, mode="file", is_output=True, too
453460
contents.extractall(path=path)
454461
else:
455462
# Galaxy < 21.01
456-
with tarfile.open(fileobj=fileobj) as tar_contents:
463+
with CompressedFile.open_tar(fileobj) as tar_contents:
457464
tar_contents.extractall(path=path)
458465
result = path
459466
else:

lib/galaxy/tools/data_fetch.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -442,8 +442,8 @@ def _decompress_target(upload_config: "UploadConfig", target: Dict[str, Any]):
442442
# fuzzy_root to False to literally interpret the target.
443443
fuzzy_root = target.get("fuzzy_root", True)
444444
temp_directory = os.path.abspath(tempfile.mkdtemp(prefix=elements_from_name, dir=upload_config.working_directory))
445-
cf = CompressedFile(elements_from_path)
446-
result = cf.extract(temp_directory)
445+
with CompressedFile(elements_from_path) as cf:
446+
result = cf.extract(temp_directory)
447447
return result if fuzzy_root else temp_directory
448448

449449

lib/galaxy/tools/imp_exp/unpack_tar_gz_archive.py

+3-14
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616

1717
from galaxy.files import ConfiguredFileSources
1818
from galaxy.files.uris import stream_url_to_file
19+
from galaxy.util.compression_utils import CompressedFile
1920

2021
# Set max size of archive/file that will be handled to be 100 GB. This is
2122
# arbitrary and should be adjusted as needed.
@@ -52,19 +53,6 @@ def check_archive(archive_file, dest_dir):
5253
return True
5354

5455

55-
def unpack_archive(archive_file, dest_dir):
56-
"""
57-
Unpack a tar and/or gzipped archive into a destination directory.
58-
"""
59-
if zipfile.is_zipfile(archive_file):
60-
with zipfile.ZipFile(archive_file, "r") as zip_archive:
61-
zip_archive.extractall(path=dest_dir)
62-
else:
63-
archive_fp = tarfile.open(archive_file, mode="r")
64-
archive_fp.extractall(path=dest_dir)
65-
archive_fp.close()
66-
67-
6856
def main(options, args):
6957
is_url = bool(options.is_url)
7058
is_file = bool(options.is_file)
@@ -84,7 +72,8 @@ def main(options, args):
8472

8573
# Unpack archive.
8674
check_archive(archive_file, dest_dir)
87-
unpack_archive(archive_file, dest_dir)
75+
with CompressedFile(archive_file) as cf:
76+
cf.archive.extractall(dest_dir)
8877

8978

9079
if __name__ == "__main__":

lib/galaxy/util/compression_utils.py

+43-8
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
import tarfile
99
import tempfile
1010
import zipfile
11+
from types import TracebackType
1112
from typing import (
1213
Any,
1314
cast,
@@ -18,10 +19,14 @@
1819
Optional,
1920
overload,
2021
Tuple,
22+
Type,
2123
Union,
2224
)
2325

24-
from typing_extensions import Literal
26+
from typing_extensions import (
27+
Literal,
28+
Self,
29+
)
2530

2631
from galaxy.util.path import (
2732
safe_relpath,
@@ -167,12 +172,16 @@ def decompress_bytes_to_directory(content: bytes) -> str:
167172
with tempfile.NamedTemporaryFile(delete=False) as fp:
168173
fp.write(content)
169174
fp.close()
170-
return CompressedFile(fp.name).extract(temp_directory)
175+
with CompressedFile(fp.name) as cf:
176+
outdir = cf.extract(temp_directory)
177+
return outdir
171178

172179

173180
def decompress_path_to_directory(path: str) -> str:
174181
temp_directory = tempfile.mkdtemp()
175-
return CompressedFile(path).extract(temp_directory)
182+
with CompressedFile(path) as cf:
183+
outdir = cf.extract(temp_directory)
184+
return outdir
176185

177186

178187
class CompressedFile:
@@ -336,13 +345,24 @@ def isfile(self, member: ArchiveMemberType) -> bool:
336345
return True
337346
return False
338347

339-
def open_tar(self, filepath: StrPath, mode: Literal["a", "r", "w", "x"]) -> tarfile.TarFile:
340-
return tarfile.open(filepath, mode, errorlevel=0)
348+
@staticmethod
349+
def open_tar(file: Union[StrPath, IO[bytes]], mode: Literal["a", "r", "w", "x"] = "r") -> tarfile.TarFile:
350+
if isinstance(file, (str, os.PathLike)):
351+
tf = tarfile.open(file, mode=mode, errorlevel=0)
352+
else:
353+
tf = tarfile.open(mode=mode, fileobj=file, errorlevel=0)
354+
# Set a safe default ("data_filter") for the extraction filter if
355+
# available, reverting to Python 3.11 behavior otherwise, see
356+
# https://docs.python.org/3/library/tarfile.html#supporting-older-python-versions
357+
tf.extraction_filter = getattr(tarfile, "data_filter", (lambda member, path: member))
358+
return tf
341359

342-
def open_zip(self, filepath: StrPath, mode: Literal["a", "r", "w", "x"]) -> zipfile.ZipFile:
343-
return zipfile.ZipFile(filepath, mode)
360+
@staticmethod
361+
def open_zip(file: Union[StrPath, IO[bytes]], mode: Literal["a", "r", "w", "x"] = "r") -> zipfile.ZipFile:
362+
return zipfile.ZipFile(file, mode)
344363

345-
def zipfile_ok(self, path_to_archive: StrPath) -> bool:
364+
@staticmethod
365+
def zipfile_ok(path_to_archive: StrPath) -> bool:
346366
"""
347367
This function is a bit pedantic and not functionally necessary. It checks whether there is
348368
no file pointing outside of the extraction, because ZipFile.extractall() has some potential
@@ -356,6 +376,21 @@ def zipfile_ok(self, path_to_archive: StrPath) -> bool:
356376
return False
357377
return True
358378

379+
def __enter__(self) -> Self:
380+
return self
381+
382+
def __exit__(
383+
self,
384+
exc_type: Optional[Type[BaseException]],
385+
exc_value: Optional[BaseException],
386+
traceback: Optional[TracebackType],
387+
) -> bool:
388+
try:
389+
self.archive.close()
390+
return exc_type is None
391+
except Exception:
392+
return False
393+
359394

360395
class FastZipFile(zipfile.ZipFile):
361396
"""

lib/tool_shed/test/base/twilltestcase.py

+7-5
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,8 @@
5555
DEFAULT_SOCKET_TIMEOUT,
5656
smart_str,
5757
)
58+
from galaxy.util.compression_utils import CompressedFile
59+
from galaxy.util.resources import as_file
5860
from galaxy_test.base.api_asserts import assert_status_code_is_ok
5961
from galaxy_test.base.api_util import get_admin_api_key
6062
from galaxy_test.base.populators import wait_on_assertion
@@ -64,7 +66,6 @@
6466
hgweb_config,
6567
xml_util,
6668
)
67-
from tool_shed.util.repository_content_util import tar_open
6869
from tool_shed.webapp.model import Repository as DbRepository
6970
from tool_shed_client.schema import (
7071
Category,
@@ -1146,7 +1147,8 @@ def add_file_to_repository(
11461147
target = os.path.basename(source)
11471148
full_target = os.path.join(temp_directory, target)
11481149
full_source = TEST_DATA_REPO_FILES.joinpath(source)
1149-
shutil.copyfile(str(full_source), full_target)
1150+
with as_file(full_source) as full_source_path:
1151+
shutil.copyfile(full_source_path, full_target)
11501152
commit_message = commit_message or "Uploaded revision with added file."
11511153
self._upload_dir_to_repository(
11521154
repository, temp_directory, commit_message=commit_message, strings_displayed=strings_displayed
@@ -1155,9 +1157,9 @@ def add_file_to_repository(
11551157
def add_tar_to_repository(self, repository: Repository, source: str, strings_displayed=None):
11561158
with self.cloned_repo(repository) as temp_directory:
11571159
full_source = TEST_DATA_REPO_FILES.joinpath(source)
1158-
tar = tar_open(full_source)
1159-
tar.extractall(path=temp_directory)
1160-
tar.close()
1160+
with full_source.open("rb") as full_source_fileobj:
1161+
with CompressedFile.open_tar(full_source_fileobj) as tar:
1162+
tar.extractall(path=temp_directory)
11611163
commit_message = "Uploaded revision with added files from tar."
11621164
self._upload_dir_to_repository(
11631165
repository, temp_directory, commit_message=commit_message, strings_displayed=strings_displayed

lib/tool_shed/test/functional/test_shed_repositories.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -228,7 +228,8 @@ def test_repository_search(self):
228228

229229
def test_repo_tars(self):
230230
for index, repo_path in enumerate(repo_tars("column_maker")):
231-
path = CompressedFile(repo_path).extract(tempfile.mkdtemp())
231+
with CompressedFile(repo_path) as cf:
232+
path = cf.extract(tempfile.mkdtemp())
232233
tool_xml_path = os.path.join(path, "column_maker.xml")
233234
tool_source = get_tool_source(config_file=tool_xml_path)
234235
tool_version = tool_source.parse_version()

lib/tool_shed/util/repository_content_util.py

+2-19
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
import os
22
import shutil
3-
import tarfile
43
import tempfile
54
from typing import (
65
Optional,
@@ -9,7 +8,7 @@
98

109
import tool_shed.repository_types.util as rt_util
1110
from galaxy.tool_shed.util.hg_util import clone_repository
12-
from galaxy.util import checkers
11+
from galaxy.util.compression_utils import CompressedFile
1312
from tool_shed.dependencies.attribute_handlers import (
1413
RepositoryDependencyAttributeHandler,
1514
ToolDependencyAttributeHandler,
@@ -26,20 +25,6 @@
2625
from tool_shed.webapp.model import Repository
2726

2827

29-
def tar_open(uploaded_file):
30-
isgzip = False
31-
isbz2 = False
32-
isgzip = checkers.is_gzip(uploaded_file)
33-
if not isgzip:
34-
isbz2 = checkers.is_bz2(uploaded_file)
35-
if isgzip or isbz2:
36-
# Open for reading with transparent compression.
37-
tar = tarfile.open(uploaded_file, "r:*")
38-
else:
39-
tar = tarfile.open(uploaded_file)
40-
return tar
41-
42-
4328
def upload_tar(
4429
trans: "ProvidesRepositoriesContext",
4530
username: str,
@@ -49,14 +34,12 @@ def upload_tar(
4934
dry_run: bool = False,
5035
remove_repo_files_not_in_tar: bool = True,
5136
new_repo_alert: bool = False,
52-
tar=None,
5337
rdah: Optional[RepositoryDependencyAttributeHandler] = None,
5438
tdah: Optional[ToolDependencyAttributeHandler] = None,
5539
) -> ChangeResponseT:
5640
host = trans.repositories_hostname
5741
app = trans.app
58-
if tar is None:
59-
tar = tar_open(uploaded_file)
42+
tar = CompressedFile.open_tar(uploaded_file)
6043
rdah = rdah or RepositoryDependencyAttributeHandler(trans, unpopulate=False)
6144
tdah = tdah or ToolDependencyAttributeHandler(trans, unpopulate=False)
6245
# Upload a tar archive of files.

test/integration/test_workflow_tasks.py

+4-2
Original file line numberDiff line numberDiff line change
@@ -60,11 +60,13 @@ def test_export_import_invocation_with_input_as_output_sts(self):
6060

6161
def test_export_ro_crate_basic(self):
6262
ro_crate_path = self._export_invocation_to_format(extension="rocrate.zip", to_uri=False)
63-
assert CompressedFile(ro_crate_path).file_type == "zip"
63+
with CompressedFile(ro_crate_path) as cf:
64+
assert cf.file_type == "zip"
6465

6566
def test_export_ro_crate_to_uri(self):
6667
ro_crate_path = self._export_invocation_to_format(extension="rocrate.zip", to_uri=True)
67-
assert CompressedFile(ro_crate_path).file_type == "zip"
68+
with CompressedFile(ro_crate_path) as cf:
69+
assert cf.file_type == "zip"
6870

6971
def test_export_bco_basic(self):
7072
bco_path = self._export_invocation_to_format(extension="bco.json", to_uri=False)

test/unit/data/model/test_model_store.py

+5-4
Original file line numberDiff line numberDiff line change
@@ -646,9 +646,9 @@ def test_export_invocation_to_ro_crate_archive(tmp_path):
646646
crate_zip = tmp_path / "crate.zip"
647647
with store.ROCrateArchiveModelExportStore(crate_zip, app=app, export_files="symlink") as export_store:
648648
export_store.export_workflow_invocation(workflow_invocation)
649-
compressed_file = CompressedFile(crate_zip)
650-
assert compressed_file.file_type == "zip"
651-
compressed_file.extract(tmp_path)
649+
with CompressedFile(crate_zip) as compressed_file:
650+
assert compressed_file.file_type == "zip"
651+
compressed_file.extract(tmp_path)
652652
crate_directory = tmp_path / "crate"
653653
validate_invocation_crate_directory(crate_directory)
654654

@@ -1249,7 +1249,8 @@ class Options:
12491249

12501250
def import_archive(archive_path, app, user, import_options=None):
12511251
dest_parent = mkdtemp()
1252-
dest_dir = CompressedFile(archive_path).extract(dest_parent)
1252+
with CompressedFile(archive_path) as cf:
1253+
dest_dir = cf.extract(dest_parent)
12531254

12541255
import_options = import_options or store.ImportOptions()
12551256
model_store = store.get_import_model_store_for_directory(

test/unit/tool_shed/test_shed_index.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
import os
22
import shutil
3-
import tarfile
43
import tempfile
54
from collections import namedtuple
65
from io import BytesIO
@@ -9,6 +8,7 @@
98
import requests
109
from whoosh import index
1110

11+
from galaxy.util.compression_utils import CompressedFile
1212
from tool_shed.util.shed_index import build_index
1313

1414
URL = "https://github.com/mvdbeek/toolshed-test-data/blob/master/toolshed_community_files.tgz?raw=true"
@@ -29,7 +29,8 @@ def community_file_dir():
2929
response = requests.get(URL)
3030
response.raise_for_status()
3131
b = BytesIO(response.content)
32-
tarfile.open(fileobj=b, mode="r:gz").extractall(extracted_archive_dir)
32+
with CompressedFile.open_tar(b) as tar:
33+
tar.extractall(extracted_archive_dir)
3334
try:
3435
yield extracted_archive_dir
3536
finally:

test/unit/util/test_compression_util.py

+5-2
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ def test_compression_safety(self):
1515
self.assert_safety("test-data/unsafe.zip", False)
1616
self.assert_safety("test-data/4.bed.zip", True)
1717
self.assert_safety("test-data/testdir.tar", True)
18+
self.assert_safety("test-data/testdir1.tar.gz", True)
1819
self.assert_safety("test-data/safetar_with_symlink.tar", True)
1920
self.assert_safety("test-data/safe_relative_symlink.tar", True)
2021

@@ -30,10 +31,12 @@ def assert_safety(self, path, expected_to_be_safe):
3031
temp_dir = tempfile.mkdtemp()
3132
try:
3233
if expected_to_be_safe:
33-
CompressedFile(path).extract(temp_dir)
34+
with CompressedFile(path) as cf:
35+
cf.extract(temp_dir)
3436
else:
3537
with self.assertRaisesRegex(Exception, "is blocked"):
36-
CompressedFile(path).extract(temp_dir)
38+
with CompressedFile(path) as cf:
39+
cf.extract(temp_dir)
3740
finally:
3841
shutil.rmtree(temp_dir, ignore_errors=True)
3942

0 commit comments

Comments
 (0)