Skip to content

Commit

Permalink
Added an option to skip the metadata parsing
Browse files Browse the repository at this point in the history
  • Loading branch information
niconoe committed Oct 18, 2024
1 parent 33f2c36 commit 1f3aecd
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 8 deletions.
6 changes: 6 additions & 0 deletions CHANGES.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@
current (unreleased)
--------------------

- Added a new `skip_metadata` parameter to the `DwCAReader` constructor. When set to `True`, the metadata file is not read. This can be useful, for example, if the metadata is corrupt (invalid XML) and you don't need it.

v0.16.2 (2024-08-23)
--------------------

- Fix a packaging issue that prevented the release of v0.16.1 on PyPI.

Expand Down
14 changes: 10 additions & 4 deletions dwca/read.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,11 @@ def __enter__(self):
# Context-manager exit hook: calls `self.close()` whenever the `with` block ends.
# The exception arguments are ignored and nothing is returned, so an exception
# raised inside the block still propagates to the caller.
def __exit__(self, exc_type, exc_value, exc_traceback):
self.close()

def __init__(self, path: str, extensions_to_ignore: List[str] = None, tmp_dir: str = None) -> None:
def __init__(self,
path: str,
extensions_to_ignore: List[str] = None,
tmp_dir: str = None,
skip_metadata: bool = False) -> None:
"""Open the Darwin Core Archive."""
if extensions_to_ignore is None:
extensions_to_ignore = []
Expand Down Expand Up @@ -111,8 +115,10 @@ def __init__(self, path: str, extensions_to_ignore: List[str] = None, tmp_dir: s
pass

#: A :class:`xml.etree.ElementTree.Element` instance containing the (scientific) metadata
#: of the archive, or `None` if the archive has no metadata.
self.metadata = self._parse_metadata_file() # type: Optional[Element]
#: of the archive, or `None` if the archive has no metadata or if the `skip_metadata` parameter is True.
self.metadata = None # type: Optional[Element]
if not skip_metadata:
self.metadata = self._parse_metadata_file() # type: Optional[Element]

#: If the archive contains source-level metadata (typically, GBIF downloads), this is a dict such as::
#:
Expand Down Expand Up @@ -396,7 +402,7 @@ def _parse_metadata_file(self) -> Optional[Element]:
"""Load the archive (scientific) Metadata file, parse it with\
ElementTree and return its content (or `None` if the archive has no metadata).
:raises: :class:`dwca.exceptions.InvalidArchive` if the archive references an non-existent
:raises: :class:`dwca.exceptions.InvalidArchive` if the archive references a non-existent
metadata file.
"""
# If the archive has descriptor, look for the metadata filename there.
Expand Down
19 changes: 15 additions & 4 deletions dwca/test/test_dwcareader.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,6 @@ def test_pd_read_chunked(self):
for chunk in dwca.pd_read("occurrence.txt", chunksize=2):
assert isinstance(chunk, pd.DataFrame)


def test_pd_read_no_data_files(self):
with DwCAReader(sample_data_path("dwca-simple-test-archive.zip")) as dwca:
with pytest.raises(NotADataFile):
Expand Down Expand Up @@ -259,8 +258,20 @@ def test_use_extensions(self):
# We ignore the extension, so archive appears without
assert not dwca.use_extensions

def test_skip_metadata_option(self):
    """Check that `skip_metadata` controls whether the archive metadata gets parsed."""
    archive_name = "dwca-simple-test-archive.zip"

    # Default behaviour: the metadata file is read and exposed as an Element
    with DwCAReader(sample_data_path(archive_name)) as default_reader:
        parsed = default_reader.metadata
        assert isinstance(parsed, ET.Element)

    # When skip_metadata=True is passed, the metadata attribute is None
    with DwCAReader(sample_data_path(archive_name), skip_metadata=True) as skipping_reader:
        assert skipping_reader.metadata is None

def test_default_metadata_filename(self):
"""Ensure that metadata is found by it's default name.
"""Ensure that metadata is found by its default name.
Metadata is named "EML.xml", but no metadata attribute in Metafile.
"""
Expand Down Expand Up @@ -413,9 +424,9 @@ def test_simplecsv_archive_eml(self):
"""Test Archive without metafile, but containing metadata.
Similar to test_simplecsv_archive, except the archive also contains a Metadata file named
EML.xml. This correspond to the second case on page #2 of
EML.xml. This corresponds to the second case on page #2 of
http://www.gbif.org/resource/80639. The metadata file having the "standard name", it should
properly handled.
properly be handled.
"""
with DwCAReader(sample_data_path("dwca-simple-csv-eml.zip")) as dwca:
# Ensure we get the correct number of rows
Expand Down

0 comments on commit 1f3aecd

Please sign in to comment.