From f7afc7380230bbdcce9f706cd5d6d04834d676aa Mon Sep 17 00:00:00 2001 From: Jeff Date: Sat, 23 Oct 2021 08:57:44 +0800 Subject: [PATCH] #38: corrected bug blocking csv2df from url --- nbs/10_utils.ipynb | 48 ++++++++++++++++++++++++++++++---------------- test/test_utils.py | 20 ++++++++++++++++--- unpackai/utils.py | 25 ++++++++++++------------ 3 files changed, 60 insertions(+), 33 deletions(-) diff --git a/nbs/10_utils.ipynb b/nbs/10_utils.ipynb index 4b509e2..d037d21 100644 --- a/nbs/10_utils.ipynb +++ b/nbs/10_utils.ipynb @@ -1096,7 +1096,10 @@ "outputs": [], "source": [ "# exportest\n", - "url_ar = r\"https://alaska.usgs.gov/data/landBirds/sewardPeninsula/2012/avianHabitat_sewardPeninsula_McNew_2012.zip\"\n", + "url_ar = (\n", + " r\"https://alaska.usgs.gov/data/landBirds/sewardPeninsula/2012/\"\n", + " \"avianHabitat_sewardPeninsula_McNew_2012.zip\"\n", + ")\n", "\n", "\n", "@pytest.mark.parametrize(\"url\", [url_ar, url_ar + \"?x=123\"], ids=[\"url\", \"url?x=y\"])\n", @@ -1140,7 +1143,7 @@ " \"\"\"CSV in zip to DataFrame\"\"\"\n", " with ZipFile(zip_path) as zf:\n", " try:\n", - " with zf.open(csv_path) as f_csv:\n", + " with zf.open(str(csv_path)) as f_csv:\n", " return pd.read_csv(f_csv)\n", " except KeyError:\n", " files = \"\\n\".join(f\" * {f}\" for f in zf.namelist() if f.lower().endswith(\".csv\"))\n", @@ -1154,7 +1157,7 @@ " \"\"\"CSV in tar to DataFrame\"\"\"\n", " with tarfile.open(tar_path) as tf:\n", " try:\n", - " csv_member = tf.getmember(csv_path)\n", + " csv_member = tf.getmember(str(csv_path))\n", " return pd.read_csv(tf.extractfile(member=csv_member))\n", " except KeyError:\n", " files = \"\\n\".join(f\" * {f}\" for f in tf.getnames() if f.lower().endswith(\".csv\"))\n", @@ -1185,21 +1188,21 @@ " )\n", "\n", " with tempfile.TemporaryDirectory() as tmpdirname:\n", - " if str(archive).startswith(\"http\"):\n", + " if isinstance(archive, str) and archive.startswith(\"http\"):\n", " zip_path = Path(tmpdirname) / archive.split(\"?\")[0].rpartition(\"/\")[-1]\n", " download(archive, dest=zip_path)\n", " else:\n", " zip_path = Path(archive)\n", "\n", - " extensions = \"\".join(zip_path.suffixes[-2:]).lower()\n", - " if extensions == \".zip\":\n", - " return _zip_csv_2_df(zip_path, csv_path)\n", - " elif extensions in (\".tar\", \".tar.gz\"):\n", - " return _tar_csv_2_df(zip_path, csv_path)\n", - " else:\n", - " raise AttributeError(\n", - " f'Archive shall be either .zip, .tar, or .tar.gz but is \"{zip_path}\"'\n", - " )\n" + " extensions = \"\".join(zip_path.suffixes[-2:]).lower()\n", + " if extensions == \".zip\":\n", + " return _zip_csv_2_df(zip_path, csv_path)\n", + " elif extensions in (\".tar\", \".tar.gz\"):\n", + " return _tar_csv_2_df(zip_path, csv_path)\n", + " else:\n", + " raise AttributeError(\n", + " f'Archive shall be either .zip, .tar, or .tar.gz but is \"{zip_path}\"'\n", + " )" ] }, { @@ -1652,13 +1655,24 @@ " ],\n", " ids=[\"flat\", \"folder\", \"subfolder\"],\n", ")\n", - "def test_read_csv_from_zip_local(archive, csv, check_connection_github):\n", - " \"\"\"Test reading CSV from a URL zip with read_csv_from_zip\"\"\"\n", + "def test_read_csv_from_zip_github(archive, csv, check_connection_github):\n", + " \"\"\"Test reading CSV from a URL zip in GitHub with read_csv_from_zip\"\"\"\n", " df = read_csv_from_zip(archive, csv)\n", " assert isinstance(df, pd.DataFrame), f\"Result is not a DataFrame: {df}\"\n", " assert len(df) == 100\n", "\n", "\n", + "def test_read_csv_from_zip_url():\n", + " \"\"\"Test reading CSV from a URL zip with read_csv_from_zip\"\"\"\n", + " url_ar = (\n", + " r\"https://alaska.usgs.gov/data/landBirds/sewardPeninsula/2012/\"\n", + " \"avianHabitat_sewardPeninsula_McNew_2012.zip\"\n", + " )\n", + " df = read_csv_from_zip(url_ar, \"avianHabitat_sewardPeninsula_McNew_2012.csv\")\n", + " assert isinstance(df, pd.DataFrame), f\"Result is not a DataFrame: {df}\"\n", + " assert len(df) == 1070\n", + "\n", + "\n", "@pytest.mark.parametrize(\n", " \"archive,csv,error\",\n", " [\n", @@ -1677,7 +1691,7 @@ " \"csv missing (tar.gz)\",\n", " \"not csv (extension)\",\n", " \"not csv (no extension)\",\n", - " \"not archive\"\n", + " \"not archive\",\n", " ],\n", ")\n", "def test_read_csv_from_zip_robustness(archive, csv, error):\n", @@ -1741,7 +1755,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3.9.6 64-bit", + "display_name": "Python 3.7.9 64-bit", "name": "python3" } }, diff --git a/test/test_utils.py b/test/test_utils.py index 8ca7f07..3597a5f 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -325,7 +325,10 @@ def test_url_2_text(check_connection_github): # Test Cell -url_ar = r"https://alaska.usgs.gov/data/landBirds/sewardPeninsula/2012/avianHabitat_sewardPeninsula_McNew_2012.zip" +url_ar = ( + r"https://alaska.usgs.gov/data/landBirds/sewardPeninsula/2012/" + "avianHabitat_sewardPeninsula_McNew_2012.zip" +) @pytest.mark.parametrize("url", [url_ar, url_ar + "?x=123"], ids=["url", "url?x=y"]) @@ -387,13 +390,24 @@ def test_read_csv_from_zip_local(archive, csv): ], ids=["flat", "folder", "subfolder"], ) -def test_read_csv_from_zip_local(archive, csv, check_connection_github): - """Test reading CSV from a URL zip with read_csv_from_zip""" +def test_read_csv_from_zip_github(archive, csv, check_connection_github): + """Test reading CSV from a URL zip in GitHub with read_csv_from_zip""" df = read_csv_from_zip(archive, csv) assert isinstance(df, pd.DataFrame), f"Result is not a DataFrame: {df}" assert len(df) == 100 +def test_read_csv_from_zip_url(): + """Test reading CSV from a URL zip with read_csv_from_zip""" + url_ar = ( + r"https://alaska.usgs.gov/data/landBirds/sewardPeninsula/2012/" + "avianHabitat_sewardPeninsula_McNew_2012.zip" + ) + df = read_csv_from_zip(url_ar, "avianHabitat_sewardPeninsula_McNew_2012.csv") + assert isinstance(df, pd.DataFrame), f"Result is not a DataFrame: {df}" + assert len(df) == 1070 + + @pytest.mark.parametrize( "archive,csv,error", [ diff --git a/unpackai/utils.py b/unpackai/utils.py index d2f456f..f527dea 100644 --- a/unpackai/utils.py +++ b/unpackai/utils.py @@ -332,7 +332,7 @@ def _zip_csv_2_df(zip_path:Path, csv_path:PathStr) -> pd.DataFrame: """CSV in zip to DataFrame""" with ZipFile(zip_path) as zf: try: - with zf.open(csv_path) as f_csv: + with zf.open(str(csv_path)) as f_csv: return pd.read_csv(f_csv) except KeyError: files = "\n".join(f" * {f}" for f in zf.namelist() if f.lower().endswith(".csv")) @@ -346,7 +346,7 @@ def _tar_csv_2_df(tar_path:Path, csv_path:PathStr) -> pd.DataFrame: """CSV in tar to DataFrame""" with tarfile.open(tar_path) as tf: try: - csv_member = tf.getmember(csv_path) + csv_member = tf.getmember(str(csv_path)) return pd.read_csv(tf.extractfile(member=csv_member)) except KeyError: files = "\n".join(f" * {f}" for f in tf.getnames() if f.lower().endswith(".csv")) @@ -371,22 +371,21 @@ def read_csv_from_zip(archive: PathURL, csv_path: PathStr) -> pd.DataFrame: ) with tempfile.TemporaryDirectory() as tmpdirname: - if str(archive).startswith("http"): + if isinstance(archive, str) and archive.startswith("http"): zip_path = Path(tmpdirname) / archive.split("?")[0].rpartition("/")[-1] download(archive, dest=zip_path) else: zip_path = Path(archive) - extensions = "".join(zip_path.suffixes[-2:]).lower() - if extensions == ".zip": - return _zip_csv_2_df(zip_path, csv_path) - elif extensions in (".tar", ".tar.gz"): - return _tar_csv_2_df(zip_path, csv_path) - else: - raise AttributeError( - f'Archive shall be either .zip, .tar, or .tar.gz but is "{zip_path}"' - ) - + extensions = "".join(zip_path.suffixes[-2:]).lower() + if extensions == ".zip": + return _zip_csv_2_df(zip_path, csv_path) + elif extensions in (".tar", ".tar.gz"): + return _tar_csv_2_df(zip_path, csv_path) + else: + raise AttributeError( + f'Archive shall be either .zip, .tar, or .tar.gz but is "{zip_path}"' + ) # Cell try: