Added an option to skip the metadata parsing

BelgianBiodiversityPlatform · Oct 18, 2024 · 1f3aecd · 1f3aecd
1 parent 33f2c36
commit 1f3aecd
Show file tree

Hide file tree

Showing 3 changed files with 31 additions and 8 deletions.
diff --git a/CHANGES.txt b/CHANGES.txt
@@ -1,4 +1,10 @@
+current (unreleased)
+--------------------
+
+- Added a new `skip_metadata` parameter to the `DwCAReader` constructor. When set to `True`, the metadata file is not read. This can be useful, for example, if the metadata file is corrupt (invalid XML) and you don't need it.
+
 v0.16.2 (2024-08-23)
+--------------------
 
 - Fix a packaging issue that prevented the release of v0.16.1 on PyPI.
 

diff --git a/dwca/read.py b/dwca/read.py
@@ -80,7 +80,11 @@ def __enter__(self):
     def __exit__(self, exc_type, exc_value, exc_traceback):
         self.close()
 
-    def __init__(self, path: str, extensions_to_ignore: List[str] = None, tmp_dir: str = None) -> None:
+    def __init__(self,
+                 path: str,
+                 extensions_to_ignore: List[str] = None,
+                 tmp_dir: str = None,
+                 skip_metadata: bool = False) -> None:
         """Open the Darwin Core Archive."""
         if extensions_to_ignore is None:
             extensions_to_ignore = []
@@ -111,8 +115,10 @@ def __init__(self, path: str, extensions_to_ignore: List[str] = None, tmp_dir: s
                 pass
 
         #: A :class:`xml.etree.ElementTree.Element` instance containing the (scientific) metadata
-        #: of the archive, or `None` if the archive has no metadata.
-        self.metadata = self._parse_metadata_file()  # type: Optional[Element]
+        #: of the archive, or `None` if the archive has no metadata or if the `skip_metadata` parameter is True.
+        self.metadata = None  # type: Optional[Element]
+        if not skip_metadata:
+            self.metadata = self._parse_metadata_file()  # type: Optional[Element]
 
         #: If the archive contains source-level metadata (typically, GBIF downloads), this is a dict such as::
         #:
@@ -396,7 +402,7 @@ def _parse_metadata_file(self) -> Optional[Element]:
         """Load the archive (scientific) Metadata file, parse it with\
         ElementTree and return its content (or `None` if the archive has no metadata).
 
-        :raises: :class:`dwca.exceptions.InvalidArchive` if the archive references an non-existent
+        :raises: :class:`dwca.exceptions.InvalidArchive` if the archive references a non-existent
         metadata file.
         """
         # If the archive has a descriptor, look for the metadata filename there.

diff --git a/dwca/test/test_dwcareader.py b/dwca/test/test_dwcareader.py
@@ -67,7 +67,6 @@ def test_pd_read_chunked(self):
             for chunk in dwca.pd_read("occurrence.txt", chunksize=2):
                 assert isinstance(chunk, pd.DataFrame)
 
-
     def test_pd_read_no_data_files(self):
         with DwCAReader(sample_data_path("dwca-simple-test-archive.zip")) as dwca:
             with pytest.raises(NotADataFile):
@@ -259,8 +258,20 @@ def test_use_extensions(self):
             # We ignore the extension, so the archive appears to have no extensions
             assert not dwca.use_extensions
 
+    def test_skip_metadata_option(self):
+        """Ensure the skip_metadata option works as intended."""
+        # By default, metadata should be read and parsed
+        with DwCAReader(sample_data_path("dwca-simple-test-archive.zip")) as dwca:
+            assert isinstance(dwca.metadata, ET.Element)
+
+        # ... but it can be skipped with the 'skip_metadata' option
+        with DwCAReader(
+            sample_data_path("dwca-simple-test-archive.zip"), skip_metadata=True
+        ) as dwca:
+            assert dwca.metadata is None
+
     def test_default_metadata_filename(self):
-        """Ensure that metadata is found by it's default name.
+        """Ensure that metadata is found by its default name.
 
         The metadata file is named "EML.xml", but the Metafile has no metadata attribute.
         """
@@ -413,9 +424,9 @@ def test_simplecsv_archive_eml(self):
         """Test Archive without metafile, but containing metadata.
 
         Similar to test_simplecsv_archive, except the archive also contains a Metadata file named
-        EML.xml. This correspond to the second case on page #2 of
+        EML.xml. This corresponds to the second case on page #2 of
         http://www.gbif.org/resource/80639. The metadata file having the "standard name", it should
-        properly handled.
+        be properly handled.
         """
         with DwCAReader(sample_data_path("dwca-simple-csv-eml.zip")) as dwca:
             # Ensure we get the correct number of rows