diff --git a/CHANGES.txt b/CHANGES.txt index e5e791a..85f8879 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,4 +1,10 @@ +current (unreleased) +-------------------- + +- Added a new `skip_metadata` parameter to the `DwCAReader` constructor. When set to `True`, the metadata file is not read. This can be useful for example if the metadata is corrupt (invalid XML) and you don't need it. + v0.16.2 (2024-08-23) +-------------------- - Fix a packaging issue that prevented the release of v0.16.1 on PyPI. diff --git a/dwca/read.py b/dwca/read.py index c345df9..a49b3d1 100644 --- a/dwca/read.py +++ b/dwca/read.py @@ -80,7 +80,11 @@ def __enter__(self): def __exit__(self, exc_type, exc_value, exc_traceback): self.close() - def __init__(self, path: str, extensions_to_ignore: List[str] = None, tmp_dir: str = None) -> None: + def __init__(self, + path: str, + extensions_to_ignore: List[str] = None, + tmp_dir: str = None, + skip_metadata: bool = False) -> None: """Open the Darwin Core Archive.""" if extensions_to_ignore is None: extensions_to_ignore = [] @@ -111,8 +115,10 @@ def __init__(self, path: str, extensions_to_ignore: List[str] = None, tmp_dir: s pass #: A :class:`xml.etree.ElementTree.Element` instance containing the (scientific) metadata - #: of the archive, or `None` if the archive has no metadata. - self.metadata = self._parse_metadata_file() # type: Optional[Element] + #: of the archive, or `None` if the archive has no metadata or if the `skip_metadata` parameter is True. + self.metadata = None # type: Optional[Element] + if not skip_metadata: + self.metadata = self._parse_metadata_file() # type: Optional[Element] #: If the archive contains source-level metadata (typically, GBIF downloads), this is a dict such as:: #: @@ -396,7 +402,7 @@ def _parse_metadata_file(self) -> Optional[Element]: """Load the archive (scientific) Metadata file, parse it with\ ElementTree and return its content (or `None` if the archive has no metadata). 
- :raises: :class:`dwca.exceptions.InvalidArchive` if the archive references an non-existent + :raises: :class:`dwca.exceptions.InvalidArchive` if the archive references a non-existent metadata file. """ # If the archive has descriptor, look for the metadata filename there. diff --git a/dwca/test/test_dwcareader.py b/dwca/test/test_dwcareader.py index f3616ab..15b8f92 100644 --- a/dwca/test/test_dwcareader.py +++ b/dwca/test/test_dwcareader.py @@ -67,7 +67,6 @@ def test_pd_read_chunked(self): for chunk in dwca.pd_read("occurrence.txt", chunksize=2): assert isinstance(chunk, pd.DataFrame) - def test_pd_read_no_data_files(self): with DwCAReader(sample_data_path("dwca-simple-test-archive.zip")) as dwca: with pytest.raises(NotADataFile): @@ -259,8 +258,20 @@ def test_use_extensions(self): # We ignore the extension, so archive appears without assert not dwca.use_extensions + def test_skip_metadata_option(self): + """Ensure the skip_metadata option works as intended.""" + # By default, metadata should be read and parsed + with DwCAReader(sample_data_path("dwca-simple-test-archive.zip")) as dwca: + assert isinstance(dwca.metadata, ET.Element) + + # ... but it can be skipped with the 'skip_metadata' option + with DwCAReader( + sample_data_path("dwca-simple-test-archive.zip"), skip_metadata=True + ) as dwca: + assert dwca.metadata is None + def test_default_metadata_filename(self): - """Ensure that metadata is found by it's default name. + """Ensure that metadata is found by its default name. Metadata is named "EML.xml", but no metadata attribute in Metafile. """ @@ -413,9 +424,9 @@ def test_simplecsv_archive_eml(self): """Test Archive without metafile, but containing metadata. Similar to test_simplecsv_archive, except the archive also contains a Metadata file named - EML.xml. This correspond to the second case on page #2 of + EML.xml. This corresponds to the second case on page #2 of http://www.gbif.org/resource/80639. 
The metadata file having the "standard name", it should - properly handled. + be properly handled. """ with DwCAReader(sample_data_path("dwca-simple-csv-eml.zip")) as dwca: # Ensure we get the correct number of rows