Skip to content

Commit

Permalink
feat: allow download in bytes
Browse files Browse the repository at this point in the history
  • Loading branch information
andy blair authored and andyantrim committed Sep 25, 2024
1 parent d957bee commit 05713a0
Show file tree
Hide file tree
Showing 5 changed files with 54 additions and 30 deletions.
2 changes: 1 addition & 1 deletion README.sdk.md
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ client = NyxClient()
subscribed_data = client.get_subscribed_data()
for data in subscribed_data:
print(f"Downloading data {data.name}")
content = data.download()
content = data.as_string() # Note: for binary files, use as_bytes() to get the content as bytes
```

## 👉 Gotchas
Expand Down
2 changes: 1 addition & 1 deletion examples/advanced/evaluate/evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def __init__(self):
self.data = client_openai_sm.get_subscribed_data()
doc_str = ""
for d in self.data:
contents = d.download()
contents = d.as_string()
if contents:
doc_str += f"\n\n{d.name}: \n\n {contents}"

Expand Down
61 changes: 42 additions & 19 deletions nyx_client/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,29 +111,52 @@ def __init__(self, **kwargs):
def __str__(self):
    """Return a concise human-readable summary of this data item."""
    # Mirrors the repr-style convention used elsewhere in the client.
    return "Data({}, {}, {})".format(self._title, self.url, self._content_type)

def as_string(self) -> str | None:
    """Download the content of the data as a string.

    This method attempts to download the content from the data's URL
    and decode it as text.

    Returns:
        The downloaded content as decoded text, or None if the download fails.
    """
    try:
        rsp = requests.get(self.url)
        # Surface 4xx/5xx responses as exceptions instead of returning error bodies.
        rsp.raise_for_status()
        return rsp.text
    except requests.RequestException as ex:
        # Best-effort download: log and return None rather than raising,
        # since remote sources may disappear at any time.
        log.warning(
            "Failed to download content of data [%s], "
            "confirm the source is still available with the data producer: %s",
            self._title,
            ex,
        )

def as_bytes(self) -> bytes | None:
    """Download the content of the data as bytes.

    This method attempts to download the content from the data's URL
    without decoding it, suitable for binary formats (Excel, images, etc.).

    Returns:
        The downloaded content as raw bytes, or None if the download fails.
    """
    try:
        rsp = requests.get(self.url)
        # Surface 4xx/5xx responses as exceptions instead of returning error bodies.
        rsp.raise_for_status()
        return rsp.content
    except requests.RequestException as ex:
        # Best-effort download: log and return None rather than raising,
        # consistent with as_string().
        log.warning(
            "Failed to download content of data [%s], "
            "confirm the source is still available with the data producer: %s",
            self._title,
            ex,
        )
def download(self) -> str | None:
    """DEPRECATED: Download the content of the data as a string.

    Deprecated alias kept for backward compatibility; use :meth:`as_string`
    (or :meth:`as_bytes` for binary content) instead.

    Returns:
        The downloaded content as a string, or None if the download fails.
    """
    # Local import keeps the deprecated path self-contained.
    import warnings

    warnings.warn(
        "Data.download() is deprecated; use as_string() or as_bytes() instead.",
        DeprecationWarning,
        stacklevel=2,
    )
    return self.as_string()
13 changes: 7 additions & 6 deletions nyx_client/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
import logging
import os
import sqlite3
from io import StringIO
from io import BytesIO
from typing import Any, List, Literal, Optional

import pandas as pd
Expand Down Expand Up @@ -177,17 +177,17 @@ def data_as_db(
tables = []
for d in data:
table_name = Parser.normalise_values([d.title])[0]
content = d.download()
content = d.as_bytes()
if content is None:
log.debug("Not adding table for %s as no content was found", d.title)
continue
try:
if d.content_type == "csv":
content = pd.read_csv(StringIO(content), on_bad_lines="skip")
content = pd.read_csv(BytesIO(content), on_bad_lines="skip")
elif d.content_type in Parser._excel_mimes:
content = pd.read_excel(StringIO(content))
content = pd.read_excel(BytesIO(content))
elif d.content_type == "json":
content = pd.read_json(StringIO(content))
content = pd.read_json(BytesIO(content))
else:
log.warning("%s is unsupported type %s", d.title, d.content_type)
continue
Expand Down Expand Up @@ -259,7 +259,8 @@ def data_as_vectors(self, data: List[Data], chunk_size: int = 1000):

for d in data:
if d.content_type != "csv":
content = d.download()
# Only vectorize plain text formats
content = d.as_string()
if content:
chunks = self._chunk_text(content, chunk_size)
all_chunks.extend(chunks)
Expand Down
6 changes: 3 additions & 3 deletions test/test_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def test_data_download(requests_mock, mock_data_details):

requests_mock.get(data.url, text="Test Content")

content = data.download()
content = data.as_string()
assert content == "Test Content"
assert data._content == "Test Content"

Expand All @@ -70,7 +70,7 @@ def test_data_download_cached(mocker, mock_data_details):

data = Data(**mock_data_details)
data._content = "Cached Content"
content = data.download()
content = data.as_string()
assert content == "Cached Content"
mock_urlopen.assert_not_called()

Expand All @@ -80,6 +80,6 @@ def test_nyx_data_download_failure(requests_mock, mock_data_details):

requests_mock.get(data.url, exc=requests.exceptions.ConnectTimeout)

content = data.download()
content = data.as_string()
assert content is None
assert data._content is None

0 comments on commit 05713a0

Please sign in to comment.