Skip to content

Commit

Permalink
feat: allow download in bytes
Browse files Browse the repository at this point in the history
  • Loading branch information
andy blair authored and andyantrim committed Sep 25, 2024
1 parent d957bee commit 05713a0
Show file tree
Hide file tree
Showing 5 changed files with 54 additions and 30 deletions.
2 changes: 1 addition & 1 deletion README.sdk.md
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ client = NyxClient()
subscribed_data = client.get_subscribed_data()
for data in subscribed_data:
print(f"Downloading data {data.name}")
content = data.download()
content = data.as_string() # Note: for binary files, use as_bytes() to get the content as bytes
```

## 👉 Gotchas
Expand Down
2 changes: 1 addition & 1 deletion examples/advanced/evaluate/evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def __init__(self):
self.data = client_openai_sm.get_subscribed_data()
doc_str = ""
for d in self.data:
contents = d.download()
contents = d.as_string()
if contents:
doc_str += f"\n\n{d.name}: \n\n {contents}"

Expand Down
61 changes: 42 additions & 19 deletions nyx_client/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,29 +111,52 @@ def __init__(self, **kwargs):
def __str__(self):
    """Return a concise human-readable summary of this data item."""
    # Mirrors the repr-style convention used elsewhere in the client.
    return "Data({}, {}, {})".format(self._title, self.url, self._content_type)

def as_string(self) -> str | None:
    """Download the content of the data as a string.

    This method attempts to download the content from the data's URL
    and decode it as text.

    Returns:
        The downloaded content as decoded text, or None if the download fails.
    """
    try:
        rsp = requests.get(self.url)
        # Surface 4xx/5xx responses as exceptions instead of returning error bodies.
        rsp.raise_for_status()
        return rsp.text
    except requests.RequestException as ex:
        # Best-effort download: log and return None rather than raising,
        # since remote sources may disappear at any time.
        log.warning(
            "Failed to download content of data [%s], "
            "confirm the source is still available with the data producer: %s",
            self._title,
            ex,
        )

def as_bytes(self) -> bytes | None:
    """Download the content of the data as bytes.

    This method attempts to download the content from the data's URL
    without decoding it, suitable for binary formats (Excel, images, etc.).

    Returns:
        The downloaded content as raw bytes, or None if the download fails.
    """
    try:
        rsp = requests.get(self.url)
        # Surface 4xx/5xx responses as exceptions instead of returning error bodies.
        rsp.raise_for_status()
        return rsp.content
    except requests.RequestException as ex:
        # Best-effort download: log and return None rather than raising,
        # consistent with as_string().
        log.warning(
            "Failed to download content of data [%s], "
            "confirm the source is still available with the data producer: %s",
            self._title,
            ex,
        )
def download(self) -> str | None:
    """DEPRECATED: Download the content of the data as a string.

    Deprecated alias kept for backward compatibility; use :meth:`as_string`
    (or :meth:`as_bytes` for binary content) instead.

    Returns:
        The downloaded content as a string, or None if the download fails.
    """
    # Local import keeps the deprecated path self-contained.
    import warnings

    warnings.warn(
        "Data.download() is deprecated; use as_string() or as_bytes() instead.",
        DeprecationWarning,
        stacklevel=2,
    )
    return self.as_string()
13 changes: 7 additions & 6 deletions nyx_client/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
import logging
import os
import sqlite3
from io import StringIO
from io import BytesIO
from typing import Any, List, Literal, Optional

import pandas as pd
Expand Down Expand Up @@ -177,17 +177,17 @@ def data_as_db(
tables = []
for d in data:
table_name = Parser.normalise_values([d.title])[0]
content = d.download()
content = d.as_bytes()
if content is None:
log.debug("Not adding table for %s as no content was found", d.title)
continue
try:
if d.content_type == "csv":
content = pd.read_csv(StringIO(content), on_bad_lines="skip")
content = pd.read_csv(BytesIO(content), on_bad_lines="skip")
elif d.content_type in Parser._excel_mimes:
content = pd.read_excel(StringIO(content))
content = pd.read_excel(BytesIO(content))
elif d.content_type == "json":
content = pd.read_json(StringIO(content))
content = pd.read_json(BytesIO(content))
else:
log.warning("%s is unsupported type %s", d.title, d.content_type)
continue
Expand Down Expand Up @@ -259,7 +259,8 @@ def data_as_vectors(self, data: List[Data], chunk_size: int = 1000):

for d in data:
if d.content_type != "csv":
content = d.download()
# Only vectorize plain text formats
content = d.as_string()
if content:
chunks = self._chunk_text(content, chunk_size)
all_chunks.extend(chunks)
Expand Down
6 changes: 3 additions & 3 deletions test/test_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def test_data_download(requests_mock, mock_data_details):

requests_mock.get(data.url, text="Test Content")

content = data.download()
content = data.as_string()
assert content == "Test Content"
assert data._content == "Test Content"

Expand All @@ -70,7 +70,7 @@ def test_data_download_cached(mocker, mock_data_details):

data = Data(**mock_data_details)
data._content = "Cached Content"
content = data.download()
content = data.as_string()
assert content == "Cached Content"
mock_urlopen.assert_not_called()

Expand All @@ -80,6 +80,6 @@ def test_nyx_data_download_failure(requests_mock, mock_data_details):

requests_mock.get(data.url, exc=requests.exceptions.ConnectTimeout)

content = data.download()
content = data.as_string()
assert content is None
assert data._content is None

0 comments on commit 05713a0

Please sign in to comment.