#38: corrected bug blocking csv2df from url
jfthuong committed Oct 23, 2021
1 parent f0de989 commit f7afc73
Showing 3 changed files with 60 additions and 33 deletions.
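The heart of the fix appears to be the re-indented block at the end of unpackai/utils.py below: the extension dispatch used to run after the `with tempfile.TemporaryDirectory()` block had already exited, so an archive downloaded from a URL was deleted before it could be opened. A minimal sketch of that failure mode (illustrative file names only, not the project's code):

```python
import tempfile
from pathlib import Path

# Sketch of the pre-fix control flow: the temporary directory is removed
# as soon as the `with` block exits, so a file downloaded into it is gone
# by the time the dispatch code tries to open it.
with tempfile.TemporaryDirectory() as tmpdirname:
    zip_path = Path(tmpdirname) / "archive.zip"  # stand-in for the downloaded archive
    zip_path.write_bytes(b"placeholder")         # stand-in for download(archive, dest=zip_path)

print(zip_path.exists())  # False: opening zip_path here fails, the likely cause of the URL bug
```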
48 changes: 31 additions & 17 deletions nbs/10_utils.ipynb
@@ -1096,7 +1096,10 @@
"outputs": [],
"source": [
"# exportest\n",
"url_ar = r\"https://alaska.usgs.gov/data/landBirds/sewardPeninsula/2012/avianHabitat_sewardPeninsula_McNew_2012.zip\"\n",
"url_ar = (\n",
" r\"https://alaska.usgs.gov/data/landBirds/sewardPeninsula/2012/\"\n",
" \"avianHabitat_sewardPeninsula_McNew_2012.zip\"\n",
")\n",
"\n",
"\n",
"@pytest.mark.parametrize(\"url\", [url_ar, url_ar + \"?x=123\"], ids=[\"url\", \"url?x=y\"])\n",
@@ -1140,7 +1143,7 @@
" \"\"\"CSV in zip to DataFrame\"\"\"\n",
" with ZipFile(zip_path) as zf:\n",
" try:\n",
" with zf.open(csv_path) as f_csv:\n",
" with zf.open(str(csv_path)) as f_csv:\n",
" return pd.read_csv(f_csv)\n",
" except KeyError:\n",
" files = \"\\n\".join(f\" * {f}\" for f in zf.namelist() if f.lower().endswith(\".csv\"))\n",
@@ -1154,7 +1157,7 @@
" \"\"\"CSV in tar to DataFrame\"\"\"\n",
" with tarfile.open(tar_path) as tf:\n",
" try:\n",
" csv_member = tf.getmember(csv_path)\n",
" csv_member = tf.getmember(str(csv_path))\n",
" return pd.read_csv(tf.extractfile(member=csv_member))\n",
" except KeyError:\n",
" files = \"\\n\".join(f\" * {f}\" for f in tf.getnames() if f.lower().endswith(\".csv\"))\n",
@@ -1185,21 +1188,21 @@
" )\n",
"\n",
" with tempfile.TemporaryDirectory() as tmpdirname:\n",
" if str(archive).startswith(\"http\"):\n",
" if isinstance(archive, str) and archive.startswith(\"http\"):\n",
" zip_path = Path(tmpdirname) / archive.split(\"?\")[0].rpartition(\"/\")[-1]\n",
" download(archive, dest=zip_path)\n",
" else:\n",
" zip_path = Path(archive)\n",
"\n",
" extensions = \"\".join(zip_path.suffixes[-2:]).lower()\n",
" if extensions == \".zip\":\n",
" return _zip_csv_2_df(zip_path, csv_path)\n",
" elif extensions in (\".tar\", \".tar.gz\"):\n",
" return _tar_csv_2_df(zip_path, csv_path)\n",
" else:\n",
" raise AttributeError(\n",
" f'Archive shall be either .zip, .tar, or .tar.gz but is \"{zip_path}\"'\n",
" )\n"
" extensions = \"\".join(zip_path.suffixes[-2:]).lower()\n",
" if extensions == \".zip\":\n",
" return _zip_csv_2_df(zip_path, csv_path)\n",
" elif extensions in (\".tar\", \".tar.gz\"):\n",
" return _tar_csv_2_df(zip_path, csv_path)\n",
" else:\n",
" raise AttributeError(\n",
" f'Archive shall be either .zip, .tar, or .tar.gz but is \"{zip_path}\"'\n",
" )"
]
},
{
@@ -1652,13 +1655,24 @@
" ],\n",
" ids=[\"flat\", \"folder\", \"subfolder\"],\n",
")\n",
"def test_read_csv_from_zip_local(archive, csv, check_connection_github):\n",
" \"\"\"Test reading CSV from a URL zip with read_csv_from_zip\"\"\"\n",
"def test_read_csv_from_zip_github(archive, csv, check_connection_github):\n",
" \"\"\"Test reading CSV from a URL zip in GitHub with read_csv_from_zip\"\"\"\n",
" df = read_csv_from_zip(archive, csv)\n",
" assert isinstance(df, pd.DataFrame), f\"Result is not a DataFrame: {df}\"\n",
" assert len(df) == 100\n",
"\n",
"\n",
"def test_read_csv_from_zip_url():\n",
" \"\"\"Test reading CSV from a URL zip with read_csv_from_zip\"\"\"\n",
" url_ar = (\n",
" r\"https://alaska.usgs.gov/data/landBirds/sewardPeninsula/2012/\"\n",
" \"avianHabitat_sewardPeninsula_McNew_2012.zip\"\n",
" )\n",
" df = read_csv_from_zip(url_ar, \"avianHabitat_sewardPeninsula_McNew_2012.csv\")\n",
" assert isinstance(df, pd.DataFrame), f\"Result is not a DataFrame: {df}\"\n",
" assert len(df) == 1070\n",
"\n",
"\n",
"@pytest.mark.parametrize(\n",
" \"archive,csv,error\",\n",
" [\n",
@@ -1677,7 +1691,7 @@
" \"csv missing (tar.gz)\",\n",
" \"not csv (extension)\",\n",
" \"not csv (no extension)\",\n",
" \"not archive\"\n",
" \"not archive\",\n",
" ],\n",
")\n",
"def test_read_csv_from_zip_robustness(archive, csv, error):\n",
@@ -1741,7 +1755,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.9.6 64-bit",
"display_name": "Python 3.7.9 64-bit",
"name": "python3"
}
},
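The other substantive change in this notebook is wrapping csv_path in str() before handing it to ZipFile.open and TarFile.getmember. Both look archive members up by their string name, so a pathlib.Path fails the lookup with KeyError even when the member exists. A small, self-contained illustration (the demo file names are made up):

```python
from pathlib import Path
from zipfile import ZipFile

# Build a throwaway zip with one CSV member, then compare lookups.
with ZipFile("demo.zip", "w") as zf:
    zf.writestr("data/values.csv", "a,b\n1,2\n")

with ZipFile("demo.zip") as zf:
    with zf.open(str(Path("data/values.csv"))) as f:  # member names are plain strings
        print(f.readline())
    try:
        zf.open(Path("data/values.csv"))              # a Path is not a member name
    except KeyError as err:
        print("lookup by Path fails:", err)
```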
20 changes: 17 additions & 3 deletions test/test_utils.py
@@ -325,7 +325,10 @@ def test_url_2_text(check_connection_github):


# Test Cell
url_ar = r"https://alaska.usgs.gov/data/landBirds/sewardPeninsula/2012/avianHabitat_sewardPeninsula_McNew_2012.zip"
url_ar = (
r"https://alaska.usgs.gov/data/landBirds/sewardPeninsula/2012/"
"avianHabitat_sewardPeninsula_McNew_2012.zip"
)


@pytest.mark.parametrize("url", [url_ar, url_ar + "?x=123"], ids=["url", "url?x=y"])
@@ -387,13 +390,24 @@ def test_read_csv_from_zip_local(archive, csv):
],
ids=["flat", "folder", "subfolder"],
)
def test_read_csv_from_zip_local(archive, csv, check_connection_github):
"""Test reading CSV from a URL zip with read_csv_from_zip"""
def test_read_csv_from_zip_github(archive, csv, check_connection_github):
"""Test reading CSV from a URL zip in GitHub with read_csv_from_zip"""
df = read_csv_from_zip(archive, csv)
assert isinstance(df, pd.DataFrame), f"Result is not a DataFrame: {df}"
assert len(df) == 100


def test_read_csv_from_zip_url():
"""Test reading CSV from a URL zip with read_csv_from_zip"""
url_ar = (
r"https://alaska.usgs.gov/data/landBirds/sewardPeninsula/2012/"
"avianHabitat_sewardPeninsula_McNew_2012.zip"
)
df = read_csv_from_zip(url_ar, "avianHabitat_sewardPeninsula_McNew_2012.csv")
assert isinstance(df, pd.DataFrame), f"Result is not a DataFrame: {df}"
assert len(df) == 1070


@pytest.mark.parametrize(
"archive,csv,error",
[
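The renamed test_read_csv_from_zip_github and the new test_read_csv_from_zip_url both need network access. Assuming a standard pytest setup for the repository, they can be run in isolation with something like the following (shown via pytest.main; the equivalent command line works too):

```python
import pytest

# Run only the two URL-related tests touched by this commit (network required).
pytest.main(
    ["test/test_utils.py", "-k", "read_csv_from_zip_github or read_csv_from_zip_url", "-q"]
)
```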
25 changes: 12 additions & 13 deletions unpackai/utils.py
@@ -332,7 +332,7 @@ def _zip_csv_2_df(zip_path:Path, csv_path:PathStr) -> pd.DataFrame:
"""CSV in zip to DataFrame"""
with ZipFile(zip_path) as zf:
try:
with zf.open(csv_path) as f_csv:
with zf.open(str(csv_path)) as f_csv:
return pd.read_csv(f_csv)
except KeyError:
files = "\n".join(f" * {f}" for f in zf.namelist() if f.lower().endswith(".csv"))
@@ -346,7 +346,7 @@ def _tar_csv_2_df(tar_path:Path, csv_path:PathStr) -> pd.DataFrame:
"""CSV in tar to DataFrame"""
with tarfile.open(tar_path) as tf:
try:
csv_member = tf.getmember(csv_path)
csv_member = tf.getmember(str(csv_path))
return pd.read_csv(tf.extractfile(member=csv_member))
except KeyError:
files = "\n".join(f" * {f}" for f in tf.getnames() if f.lower().endswith(".csv"))
@@ -371,22 +371,21 @@ def read_csv_from_zip(archive: PathURL, csv_path: PathStr) -> pd.DataFrame:
)

with tempfile.TemporaryDirectory() as tmpdirname:
if str(archive).startswith("http"):
if isinstance(archive, str) and archive.startswith("http"):
zip_path = Path(tmpdirname) / archive.split("?")[0].rpartition("/")[-1]
download(archive, dest=zip_path)
else:
zip_path = Path(archive)

extensions = "".join(zip_path.suffixes[-2:]).lower()
if extensions == ".zip":
return _zip_csv_2_df(zip_path, csv_path)
elif extensions in (".tar", ".tar.gz"):
return _tar_csv_2_df(zip_path, csv_path)
else:
raise AttributeError(
f'Archive shall be either .zip, .tar, or .tar.gz but is "{zip_path}"'
)

extensions = "".join(zip_path.suffixes[-2:]).lower()
if extensions == ".zip":
return _zip_csv_2_df(zip_path, csv_path)
elif extensions in (".tar", ".tar.gz"):
return _tar_csv_2_df(zip_path, csv_path)
else:
raise AttributeError(
f'Archive shall be either .zip, .tar, or .tar.gz but is "{zip_path}"'
)

# Cell
try:
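Putting it together, the new test_read_csv_from_zip_url exercises the repaired URL path end to end. A usage sketch mirroring that test (it assumes read_csv_from_zip is importable from unpackai.utils, as the test module does, and that the USGS URL is reachable):

```python
import pandas as pd
from unpackai.utils import read_csv_from_zip

# Download the USGS archive into a temporary directory and read one CSV member
# straight into a DataFrame, the flow the corrected indentation restores.
url_ar = (
    "https://alaska.usgs.gov/data/landBirds/sewardPeninsula/2012/"
    "avianHabitat_sewardPeninsula_McNew_2012.zip"
)
df = read_csv_from_zip(url_ar, "avianHabitat_sewardPeninsula_McNew_2012.csv")
assert isinstance(df, pd.DataFrame)
print(len(df))  # the new test expects 1070 rows
```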
