#38: corrected bug blocking csv2df from url
jfthuong committed Oct 23, 2021
1 parent f0de989 commit f7afc73
Showing 3 changed files with 60 additions and 33 deletions.
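The heart of the fix appears to be the re-indented block at the end of unpackai/utils.py below: the extension dispatch used to run after the `with tempfile.TemporaryDirectory()` block had already exited, so an archive downloaded from a URL was deleted before it could be opened. A minimal sketch of that failure mode (illustrative file names only, not the project's code):

```python
import tempfile
from pathlib import Path

# Sketch of the pre-fix control flow: the temporary directory is removed
# as soon as the `with` block exits, so a file downloaded into it is gone
# by the time the dispatch code tries to open it.
with tempfile.TemporaryDirectory() as tmpdirname:
    zip_path = Path(tmpdirname) / "archive.zip"  # stand-in for the downloaded archive
    zip_path.write_bytes(b"placeholder")         # stand-in for download(archive, dest=zip_path)

print(zip_path.exists())  # False: opening zip_path here fails, the likely cause of the URL bug
```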
48 changes: 31 additions & 17 deletions nbs/10_utils.ipynb
@@ -1096,7 +1096,10 @@
"outputs": [],
"source": [
"# exportest\n",
"url_ar = r\"https://alaska.usgs.gov/data/landBirds/sewardPeninsula/2012/avianHabitat_sewardPeninsula_McNew_2012.zip\"\n",
"url_ar = (\n",
" r\"https://alaska.usgs.gov/data/landBirds/sewardPeninsula/2012/\"\n",
" \"avianHabitat_sewardPeninsula_McNew_2012.zip\"\n",
")\n",
"\n",
"\n",
"@pytest.mark.parametrize(\"url\", [url_ar, url_ar + \"?x=123\"], ids=[\"url\", \"url?x=y\"])\n",
@@ -1140,7 +1143,7 @@
" \"\"\"CSV in zip to DataFrame\"\"\"\n",
" with ZipFile(zip_path) as zf:\n",
" try:\n",
" with zf.open(csv_path) as f_csv:\n",
" with zf.open(str(csv_path)) as f_csv:\n",
" return pd.read_csv(f_csv)\n",
" except KeyError:\n",
" files = \"\\n\".join(f\" * {f}\" for f in zf.namelist() if f.lower().endswith(\".csv\"))\n",
@@ -1154,7 +1157,7 @@
" \"\"\"CSV in tar to DataFrame\"\"\"\n",
" with tarfile.open(tar_path) as tf:\n",
" try:\n",
" csv_member = tf.getmember(csv_path)\n",
" csv_member = tf.getmember(str(csv_path))\n",
" return pd.read_csv(tf.extractfile(member=csv_member))\n",
" except KeyError:\n",
" files = \"\\n\".join(f\" * {f}\" for f in tf.getnames() if f.lower().endswith(\".csv\"))\n",
@@ -1185,21 +1188,21 @@
" )\n",
"\n",
" with tempfile.TemporaryDirectory() as tmpdirname:\n",
" if str(archive).startswith(\"http\"):\n",
" if isinstance(archive, str) and archive.startswith(\"http\"):\n",
" zip_path = Path(tmpdirname) / archive.split(\"?\")[0].rpartition(\"/\")[-1]\n",
" download(archive, dest=zip_path)\n",
" else:\n",
" zip_path = Path(archive)\n",
"\n",
" extensions = \"\".join(zip_path.suffixes[-2:]).lower()\n",
" if extensions == \".zip\":\n",
" return _zip_csv_2_df(zip_path, csv_path)\n",
" elif extensions in (\".tar\", \".tar.gz\"):\n",
" return _tar_csv_2_df(zip_path, csv_path)\n",
" else:\n",
" raise AttributeError(\n",
" f'Archive shall be either .zip, .tar, or .tar.gz but is \"{zip_path}\"'\n",
" )\n"
" extensions = \"\".join(zip_path.suffixes[-2:]).lower()\n",
" if extensions == \".zip\":\n",
" return _zip_csv_2_df(zip_path, csv_path)\n",
" elif extensions in (\".tar\", \".tar.gz\"):\n",
" return _tar_csv_2_df(zip_path, csv_path)\n",
" else:\n",
" raise AttributeError(\n",
" f'Archive shall be either .zip, .tar, or .tar.gz but is \"{zip_path}\"'\n",
" )"
]
},
{
@@ -1652,13 +1655,24 @@
" ],\n",
" ids=[\"flat\", \"folder\", \"subfolder\"],\n",
")\n",
"def test_read_csv_from_zip_local(archive, csv, check_connection_github):\n",
" \"\"\"Test reading CSV from a URL zip with read_csv_from_zip\"\"\"\n",
"def test_read_csv_from_zip_github(archive, csv, check_connection_github):\n",
" \"\"\"Test reading CSV from a URL zip in GitHub with read_csv_from_zip\"\"\"\n",
" df = read_csv_from_zip(archive, csv)\n",
" assert isinstance(df, pd.DataFrame), f\"Result is not a DataFrame: {df}\"\n",
" assert len(df) == 100\n",
"\n",
"\n",
"def test_read_csv_from_zip_url():\n",
" \"\"\"Test reading CSV from a URL zip with read_csv_from_zip\"\"\"\n",
" url_ar = (\n",
" r\"https://alaska.usgs.gov/data/landBirds/sewardPeninsula/2012/\"\n",
" \"avianHabitat_sewardPeninsula_McNew_2012.zip\"\n",
" )\n",
" df = read_csv_from_zip(url_ar, \"avianHabitat_sewardPeninsula_McNew_2012.csv\")\n",
" assert isinstance(df, pd.DataFrame), f\"Result is not a DataFrame: {df}\"\n",
" assert len(df) == 1070\n",
"\n",
"\n",
"@pytest.mark.parametrize(\n",
" \"archive,csv,error\",\n",
" [\n",
@@ -1677,7 +1691,7 @@
" \"csv missing (tar.gz)\",\n",
" \"not csv (extension)\",\n",
" \"not csv (no extension)\",\n",
" \"not archive\"\n",
" \"not archive\",\n",
" ],\n",
")\n",
"def test_read_csv_from_zip_robustness(archive, csv, error):\n",
@@ -1741,7 +1755,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.9.6 64-bit",
"display_name": "Python 3.7.9 64-bit",
"name": "python3"
}
},
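The other substantive change in this notebook is wrapping csv_path in str() before handing it to ZipFile.open and TarFile.getmember. Both look archive members up by their string name, so a pathlib.Path fails the lookup with KeyError even when the member exists. A small, self-contained illustration (the demo file names are made up):

```python
from pathlib import Path
from zipfile import ZipFile

# Build a throwaway zip with one CSV member, then compare lookups.
with ZipFile("demo.zip", "w") as zf:
    zf.writestr("data/values.csv", "a,b\n1,2\n")

with ZipFile("demo.zip") as zf:
    with zf.open(str(Path("data/values.csv"))) as f:  # member names are plain strings
        print(f.readline())
    try:
        zf.open(Path("data/values.csv"))              # a Path is not a member name
    except KeyError as err:
        print("lookup by Path fails:", err)
```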
20 changes: 17 additions & 3 deletions test/test_utils.py
@@ -325,7 +325,10 @@ def test_url_2_text(check_connection_github):


# Test Cell
url_ar = r"https://alaska.usgs.gov/data/landBirds/sewardPeninsula/2012/avianHabitat_sewardPeninsula_McNew_2012.zip"
url_ar = (
r"https://alaska.usgs.gov/data/landBirds/sewardPeninsula/2012/"
"avianHabitat_sewardPeninsula_McNew_2012.zip"
)


@pytest.mark.parametrize("url", [url_ar, url_ar + "?x=123"], ids=["url", "url?x=y"])
@@ -387,13 +390,24 @@ def test_read_csv_from_zip_local(archive, csv):
],
ids=["flat", "folder", "subfolder"],
)
def test_read_csv_from_zip_local(archive, csv, check_connection_github):
"""Test reading CSV from a URL zip with read_csv_from_zip"""
def test_read_csv_from_zip_github(archive, csv, check_connection_github):
"""Test reading CSV from a URL zip in GitHub with read_csv_from_zip"""
df = read_csv_from_zip(archive, csv)
assert isinstance(df, pd.DataFrame), f"Result is not a DataFrame: {df}"
assert len(df) == 100


def test_read_csv_from_zip_url():
"""Test reading CSV from a URL zip with read_csv_from_zip"""
url_ar = (
r"https://alaska.usgs.gov/data/landBirds/sewardPeninsula/2012/"
"avianHabitat_sewardPeninsula_McNew_2012.zip"
)
df = read_csv_from_zip(url_ar, "avianHabitat_sewardPeninsula_McNew_2012.csv")
assert isinstance(df, pd.DataFrame), f"Result is not a DataFrame: {df}"
assert len(df) == 1070


@pytest.mark.parametrize(
"archive,csv,error",
[
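The renamed test_read_csv_from_zip_github and the new test_read_csv_from_zip_url both need network access. Assuming a standard pytest setup for the repository, they can be run in isolation with something like the following (shown via pytest.main; the equivalent command line works too):

```python
import pytest

# Run only the two URL-related tests touched by this commit (network required).
pytest.main(
    ["test/test_utils.py", "-k", "read_csv_from_zip_github or read_csv_from_zip_url", "-q"]
)
```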
25 changes: 12 additions & 13 deletions unpackai/utils.py
@@ -332,7 +332,7 @@ def _zip_csv_2_df(zip_path:Path, csv_path:PathStr) -> pd.DataFrame:
"""CSV in zip to DataFrame"""
with ZipFile(zip_path) as zf:
try:
with zf.open(csv_path) as f_csv:
with zf.open(str(csv_path)) as f_csv:
return pd.read_csv(f_csv)
except KeyError:
files = "\n".join(f" * {f}" for f in zf.namelist() if f.lower().endswith(".csv"))
@@ -346,7 +346,7 @@ def _tar_csv_2_df(tar_path:Path, csv_path:PathStr) -> pd.DataFrame:
"""CSV in tar to DataFrame"""
with tarfile.open(tar_path) as tf:
try:
csv_member = tf.getmember(csv_path)
csv_member = tf.getmember(str(csv_path))
return pd.read_csv(tf.extractfile(member=csv_member))
except KeyError:
files = "\n".join(f" * {f}" for f in tf.getnames() if f.lower().endswith(".csv"))
@@ -371,22 +371,21 @@ def read_csv_from_zip(archive: PathURL, csv_path: PathStr) -> pd.DataFrame:
)

with tempfile.TemporaryDirectory() as tmpdirname:
if str(archive).startswith("http"):
if isinstance(archive, str) and archive.startswith("http"):
zip_path = Path(tmpdirname) / archive.split("?")[0].rpartition("/")[-1]
download(archive, dest=zip_path)
else:
zip_path = Path(archive)

extensions = "".join(zip_path.suffixes[-2:]).lower()
if extensions == ".zip":
return _zip_csv_2_df(zip_path, csv_path)
elif extensions in (".tar", ".tar.gz"):
return _tar_csv_2_df(zip_path, csv_path)
else:
raise AttributeError(
f'Archive shall be either .zip, .tar, or .tar.gz but is "{zip_path}"'
)

extensions = "".join(zip_path.suffixes[-2:]).lower()
if extensions == ".zip":
return _zip_csv_2_df(zip_path, csv_path)
elif extensions in (".tar", ".tar.gz"):
return _tar_csv_2_df(zip_path, csv_path)
else:
raise AttributeError(
f'Archive shall be either .zip, .tar, or .tar.gz but is "{zip_path}"'
)

# Cell
try:
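Putting it together, the new test_read_csv_from_zip_url exercises the repaired URL path end to end. A usage sketch mirroring that test (it assumes read_csv_from_zip is importable from unpackai.utils, as the test module does, and that the USGS URL is reachable):

```python
import pandas as pd
from unpackai.utils import read_csv_from_zip

# Download the USGS archive into a temporary directory and read one CSV member
# straight into a DataFrame, the flow the corrected indentation restores.
url_ar = (
    "https://alaska.usgs.gov/data/landBirds/sewardPeninsula/2012/"
    "avianHabitat_sewardPeninsula_McNew_2012.zip"
)
df = read_csv_from_zip(url_ar, "avianHabitat_sewardPeninsula_McNew_2012.csv")
assert isinstance(df, pd.DataFrame)
print(len(df))  # the new test expects 1070 rows
```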
