Merge pull request #18 from mikecafarella/biofabric
Support for Tables + biofabric demo
vitaglianog authored Apr 15, 2024
2 parents 08e9d46 + 9c455b2 commit 9a80b0b
Showing 28 changed files with 979 additions and 57 deletions.
4 changes: 2 additions & 2 deletions pyproject.toml
@@ -12,8 +12,7 @@ dependencies = [
"charset-normalizer>=3.3.2",
"click>=8.1.7",
"click-aliases>=1.0.4",
-# updated dspy to make it work with Google Gemini
-"dspy-ai>=2.4.1", # not available on pypi, install from source
+"dspy-ai>=2.4.1",
"fastapi~=0.100.0",
"google-generativeai==0.4.1",
"gradio>=4.20.1",
@@ -24,6 +23,7 @@ dependencies = [
"necessary>=0.3.2",
"numpy>=1.23.2",
"openai>=1.0",
"openpyxl==3.1.2",
"pandas~=2.1.1",
"papermage>=0.16.0",
"pdf2image",
12 changes: 11 additions & 1 deletion src/palimpzest/corelib/functions.py
@@ -3,6 +3,7 @@
from palimpzest.corelib import URL, WebPage, Download

import requests
from requests_html import HTMLSession # for downloading JavaScript content
import datetime
from bs4 import BeautifulSoup

@@ -32,11 +33,20 @@ def html_to_text_with_links(self, html):
text = soup.get_text(separator='\n', strip=True)
return text

def get_page_text(self, url):
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
}

session = HTMLSession()
response = session.get(url, headers=headers)
return response.text

# Someday we should introduce an abstraction that lets us
# coalesce many requests into a bulk operation. This code is
# fine for a dozen requests, but not for a million.
def map(self, dr: DataRecord):
-textcontent = requests.get(dr.url).text
+textcontent = self.get_page_text(dr.url)
dr2 = DataRecord(self.outputSchema, parent_uuid=dr._uuid)
dr2.url = dr.url

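For context, a minimal standalone sketch of the new JavaScript-capable download path: requests_html is the library imported above, while the function name, example URL, and __main__ wrapper here are illustrative only.

# Sketch of the download path added above; requests_html's HTMLSession
# behaves like requests.Session but can also render JavaScript-driven pages.
from requests_html import HTMLSession

def fetch_page_text(url: str) -> str:
    # A browser-like User-Agent avoids naive bot blocking on some sites.
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
    session = HTMLSession()
    response = session.get(url, headers=headers)
    return response.text  # raw body; response.html.render() would execute JS

if __name__ == "__main__":
    print(fetch_page_text("https://example.com")[:200])
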
19 changes: 17 additions & 2 deletions src/palimpzest/corelib/schemas.py
@@ -1,4 +1,4 @@
-from palimpzest.elements import BytesField, Schema, StringField, File
+from palimpzest.elements import BytesField, Schema, StringField, File, NumericField, ListField

###################################################################################
# "Core" useful Schemas. These are Schemas that almost everyone will need.
@@ -36,4 +36,19 @@ class WebPage(Schema):
url = StringField(desc="The URL of the web page", required=True)
text = StringField(desc="The text contents of the web page", required=True)
html = StringField(desc="The html contents of the web page", required=True)
timestamp = StringField(desc="The timestamp of the download", required=True)

class XLSFile(File):
"""An XLS file is a File that contains one or more Excel spreadsheets."""
number_sheets = NumericField(desc="The number of sheets in the Excel file", required=True)
sheet_names = ListField(element_type=StringField, desc="The names of the sheets in the Excel file", required=True)

class TabularRow(Schema):
"""A Row is a list of cells. For simplicity, we assume that all cell values are strings."""
cells = ListField(element_type=StringField, desc="The cells in the row", required=True)

class Table(Schema):
"""A Table is an object composed of a header and rows."""
name = StringField(desc="The name of the table", required=False)
header = ListField(element_type=StringField, desc="The header of the table", required=True)
rows = ListField(element_type=TabularRow, desc="The rows of the table", required=True)
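
A hedged sketch of how the new Table and TabularRow shapes could be populated from a spreadsheet with openpyxl (the dependency pinned in pyproject.toml above); load_tables and the plain-dict layout are illustrative assumptions, not Palimpzest API.

# Hypothetical illustration only: filling the Table/TabularRow shapes
# from an Excel file via openpyxl. load_tables() and the dict layout
# are assumptions, not part of this commit.
from openpyxl import load_workbook

def load_tables(path: str) -> list[dict]:
    wb = load_workbook(path, read_only=True)
    tables = []
    for sheet in wb.worksheets:
        rows = [["" if cell is None else str(cell) for cell in row]
                for row in sheet.iter_rows(values_only=True)]
        if not rows:
            continue
        tables.append({
            "name": sheet.title,                       # Table.name
            "header": rows[0],                         # Table.header
            "rows": [{"cells": r} for r in rows[1:]],  # TabularRow.cells
        })
    return tables
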
3 changes: 2 additions & 1 deletion src/palimpzest/elements/__init__.py
@@ -3,4 +3,5 @@
from .aggregatefunction import *
from .records import *
from .functions import *
from .groupbysig import *
from .pzlist import *
2 changes: 1 addition & 1 deletion src/palimpzest/elements/pzlist.py
@@ -1,4 +1,4 @@
-from palimpzest import Field
+from palimpzest.elements import Field

class ListField(Field, list):
"""A field representing a list of elements of specified types, with full list functionality."""
2 changes: 1 addition & 1 deletion src/palimpzest/elements/records.py
@@ -68,7 +68,7 @@ def asJSON(self):

def __str__(self):
keys = sorted(self.__dict__)
items = ("{}={!r}".format(k, str(self.__dict__[k])[:15]) for k in keys)
items = ("{}={!r}...".format(k, str(self.__dict__[k])[:15]) for k in keys)
return "{}({})".format(type(self).__name__, ", ".join(items))

def __eq__(self, other):
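
To illustrate the __str__ tweak above, a tiny self-contained sketch (the Demo class is invented): each field value is cut to 15 characters, and the new trailing ellipsis marks the truncation.

# Invented Demo class showing the effect of the __str__ change above.
class Demo:
    def __init__(self):
        self.url = "https://example.com/a/very/long/path"

    def __str__(self):
        keys = sorted(self.__dict__)
        items = ("{}={!r}...".format(k, str(self.__dict__[k])[:15]) for k in keys)
        return "{}({})".format(type(self).__name__, ", ".join(items))

print(Demo())  # Demo(url='https://example'...)
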
3 changes: 1 addition & 2 deletions src/palimpzest/execution/__init__.py
@@ -1,2 +1 @@

-from .execution import Execution
+from .execution import Execution, SimpleExecution
80 changes: 80 additions & 0 deletions src/palimpzest/execution/execution.py
@@ -2,6 +2,7 @@
from palimpzest.policy import Policy, MaxQuality, UserChoice
from palimpzest.profiler import StatsProcessor
from palimpzest.datamanager import DataDirectory
import itertools

def emitNestedTuple(node, indent=0):
elt, child = node
@@ -10,6 +11,43 @@ def emitNestedTuple(node, indent=0):
emitNestedTuple(child, indent=indent+2)


def flatten_nested_tuples(nested_tuples):
"""
This function takes a nested iterable of the form (4,(3,(2,(1,())))) and flattens it to (1, 2, 3, 4).
"""
result = []
def flatten(item):
if isinstance(item, tuple):
if item: # Check if not an empty tuple
flatten(item[0]) # Process the head
flatten(item[1]) # Process the tail
else:
result.append(item)

flatten(nested_tuples)
result = list(result)
result.reverse()
return result[1:]

def graphicEmit(flatten_ops):
start = flatten_ops[0]
print(f" 0. {type(start).__name__} -> {start.outputSchema.__name__} \n")

for idx, (left, right) in enumerate(itertools.pairwise(flatten_ops)):
in_schema = left.outputSchema
out_schema = right.outputSchema
print(f" {idx+1}. {in_schema.__name__} -> {type(right).__name__} -> {out_schema.__name__} ", end="")
# if right.desc is not None:
# print(f" ({right.desc})", end="")
# check if right has a model attribute
if hasattr(right, 'model'):
print(f"\n Using {right.model}", end="")
if hasattr(right, 'filter'):
print(f'\n Filter: "{right.filter.filterCondition}"', end="")
print()
print(f" ({','.join(in_schema.fieldNames())[:15]}...) -> ({','.join(out_schema.fieldNames())[:15]}...)")
print()

class Execution:
"""An Execution is responsible for completing a query on a given Set.
Right now we assume the query is always SELECT * FROM set."""
@@ -65,6 +103,9 @@ def executeAndOptimize(self, verbose: bool=False, shouldProfile: bool=False):
# TODO: remove
if verbose:
import json
import os
if not os.path.exists('profiling-data'):
os.makedirs('profiling-data')
with open('profiling-data/eo-raw_profiling.json', 'w') as f:
sp = StatsProcessor(profileData)
json.dump(sp.profiling_data.to_dict(), f)
@@ -120,3 +161,42 @@ def executeAndOptimize(self, verbose: bool=False, shouldProfile: bool=False):
# 3. gather execution data
# 4. provide all cost estimates to physical operators

class SimpleExecution(Execution):
"""
This simple execution does not pre-sample the data and uses neither caching nor profiling.
It just runs the query and returns the results.
"""

def executeAndOptimize(self, verbose: bool=False, shouldProfile: bool=False):
"""An execution of the rootset, subject to user-given policy."""
# Reoptimize the logical plan, this time with the sample data.
# (The data is not currently being used; let's see if this method can work first.)
logicalTree = self.rootset.getLogicalTree()
candidatePlans = logicalTree.createPhysicalPlanCandidates(shouldProfile=shouldProfile)
if type(self.policy) == UserChoice:
print("-----AVAILABLE PLANS -----")
for idx, cp in enumerate(candidatePlans):
print(f"Plan {idx}: Time est: {cp[0]:.3f} -- Cost est: {cp[1]:.3f} -- Quality est: {cp[2]:.3f}")
print("Physical operator tree")
physicalOps = cp[3].dumpPhysicalTree()
emitNestedTuple(physicalOps)
print("----------")
# flatten_ops = flatten_nested_tuples(physicalOps)
# graphicEmit(flatten_ops)
# print("----------")

planTime, planCost, quality, physicalTree = self.policy.choose(candidatePlans)

if verbose:
print("----------")
print(f"Policy is: {self.policy}")
print(f"Chosen plan: Time est: {planTime:.3f} -- Cost est: {planCost:.3f} -- Quality est: {quality:.3f}")
ops = physicalTree.dumpPhysicalTree()
flatten_ops = flatten_nested_tuples(ops)
graphicEmit(flatten_ops)
# emitNestedTuple(ops)

return physicalTree
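
As a sanity check on flatten_nested_tuples above, a self-contained run on the docstring's example shape; note that the trailing result[1:] drops the first element after reversal, so the call below prints [2, 3, 4].

# Self-contained copy of flatten_nested_tuples, run on the docstring's example.
def flatten_nested_tuples(nested_tuples):
    result = []
    def flatten(item):
        if isinstance(item, tuple):
            if item:                # skip the empty-tuple terminator
                flatten(item[0])    # head: the element at this level
                flatten(item[1])    # tail: the nested remainder
        else:
            result.append(item)
    flatten(nested_tuples)
    result.reverse()
    return result[1:]               # drops the innermost element

print(flatten_nested_tuples((4, (3, (2, (1, ()))))))  # [2, 3, 4]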

80 changes: 52 additions & 28 deletions src/palimpzest/generators/generators.py
@@ -70,14 +70,31 @@ def _get_model(self) -> dsp.LM:

elif self.model_name in [Model.GEMINI_1.value]:
google_key = get_api_key('GOOGLE_API_KEY')
-model = dspy.Google(model=self.model_name, api_key=google_key, return_dict=True)
+model = dspy.Google(model=self.model_name, api_key=google_key)

else:
raise ValueError("Model must be one of the language models specified in palimpzest.constants.Model")

return model

-def _get_attn(dspy_lm: dsp.LM):
def _get_usage_and_finish_reason(self, dspy_lm: dsp.LM):
"""
Parse and return the usage statistics and finish reason.
"""
usage, finish_reason = None, None
if self.model_name in [Model.GPT_3_5.value, Model.GPT_4.value]:
usage = dspy_lm.history[-1]['response']['usage']
finish_reason = dspy_lm.history[-1]['response']['choices'][-1]['finish_reason']
elif self.model_name in [Model.GEMINI_1.value]:
usage = {}
finish_reason = dspy_lm.history[-1]['response'][0]._result.candidates[0].finish_reason
elif self.model_name in [Model.MIXTRAL.value]:
usage = dspy_lm.history[-1]['response']['usage']
finish_reason = dspy_lm.history[-1]['response']['finish_reason']

return usage, finish_reason

+def _get_attn(self, dspy_lm: dsp.LM):
"""
TODO
"""
@@ -92,11 +109,19 @@ def _get_answer_log_probs(self, dspy_lm: dsp.LM, answer: str) -> List[float]:
"""
# get log probabilities data structure
tokens, token_logprobs = None, None
-if self.model_name in [Model.GPT_3_5.value, Model.GPT_4.value, Model.GEMINI_1.value]:

+if self.model_name in [Model.GPT_3_5.value, Model.GPT_4.value]:
# [{'token': 'some', 'bytes': [12, 34, ...], 'logprob': -0.7198808, 'top_logprobs': []}}]
log_probs = dspy_lm.history[-1]['response']['choices'][-1]['logprobs']['content']
tokens = list(map(lambda elt: elt['token'], log_probs))
token_logprobs = list(map(lambda elt: elt['logprob'], log_probs))
elif self.model_name in [Model.GEMINI_1.value]:
return None
# TODO Google gemini does not provide log probabilities!
# https://github.com/google/generative-ai-python/issues/238
# tok_count = dspy_lm.llm.count_tokens(answer).total_tokens
# tokens = [""] * tok_count
# token_logprobs = [0] * len(tokens)
elif self.model_name in [Model.MIXTRAL.value]:
# response: dict_keys(['prompt', 'choices', 'usage', 'finish_reason', 'tokens', 'token_logprobs'])
tokens = dspy_lm.history[-1]['response']['tokens']
@@ -105,24 +130,24 @@ def _get_answer_log_probs(self, dspy_lm: dsp.LM, answer: str) -> List[float]:
raise ValueError("Model must be one of the language models specified in palimpzest.constants.Model")

# get indices of the start and end token for the answer
-start_idx, end_idx = 0, 0
-while not answer.strip() == "".join(tokens[start_idx:end_idx+1]).strip():
-if answer.startswith(tokens[start_idx]):
-end_idx += 1
-else:
-start_idx += 1
-end_idx = start_idx

+# start_idx, end_idx = 0, 0
+# while not answer.strip() == "".join(tokens[start_idx:end_idx+1]).strip():
+# if answer.startswith(tokens[start_idx]):
+# end_idx += 1
+# else:
+# start_idx += 1
+# end_idx = start_idx
# filter for log probs of tokens which appear in answer
-answer_log_probs = token_logprobs[start_idx:end_idx+1]

+# answer_log_probs = token_logprobs[start_idx:end_idx+1]
+answer_log_probs = token_logprobs
# return those tokens log probabilities
return answer_log_probs

@retry(
wait=wait_exponential(multiplier=RETRY_MULTIPLIER, max=RETRY_MAX_SECS),
stop=stop_after_attempt(RETRY_MAX_ATTEMPTS),
after=log_attempt_number,
reraise=True,
)
def generate(self, context: str, question: str) -> GenerationOutput:
# fetch model
@@ -139,18 +164,15 @@ def generate(self, context: str, question: str) -> GenerationOutput:

# extract the log probabilities for the actual result(s) which are returned
answer_log_probs = self._get_answer_log_probs(dspy_lm, pred.answer)
usage, finish_reason = self._get_usage_and_finish_reason(dspy_lm)

# collect statistics on prompt, usage, and timing
stats = GenerationStats(
model_name=self.model_name,
llm_call_duration_secs=end_time - start_time,
prompt=dspy_lm.history[-1]['prompt'],
-usage=dspy_lm.history[-1]['response']['usage'],
-finish_reason=(
-dspy_lm.history[-1]['response']['finish_reason']
-if isinstance(dspy_lm, TogetherHFAdaptor)
-else dspy_lm.history[-1]['response']['choices'][-1]['finish_reason']
-),
+usage=usage,
+finish_reason=finish_reason,
answer_log_probs=answer_log_probs,
answer=pred.answer,
)
@@ -255,17 +277,19 @@ def _get_answer_log_probs(self, tokens: List[str], token_logprobs: List[float],
Filter and return the list of log probabilities for the tokens which appear in `answer`.
"""
# get indices of the start and end token for the answer
-start_idx, end_idx = 0, 0
-while not answer.strip() == "".join(tokens[start_idx:end_idx+1]).strip():
-if answer.startswith(tokens[start_idx]):
-end_idx += 1
-else:
-start_idx += 1
-end_idx = start_idx

-# filter for log probs of tokens which appear in answer
-answer_log_probs = token_logprobs[start_idx:end_idx+1]
+# start_idx, end_idx = 0, 0
+# while not answer.strip() == "".join(tokens[start_idx:end_idx+1]).strip():
+# if answer.startswith(tokens[start_idx]):
+# end_idx += 1
+# else:
+# start_idx += 1
+# end_idx = start_idx

+# filter for log probs of tokens which appear in answer
+# answer_log_probs = token_logprobs[start_idx:end_idx+1]
+answer_log_probs = token_logprobs

# return those tokens log probabilities
return answer_log_probs

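For reference, a hedged sketch of the response shape the OpenAI-style branches above consume; the history entry below is hand-built for illustration, not a real API call.

# Hand-built stand-in for dspy_lm.history[-1]['response'] (OpenAI-style),
# showing the fields the usage/finish_reason/logprob parsing above reads.
response = {
    "usage": {"prompt_tokens": 12, "completion_tokens": 2, "total_tokens": 14},
    "choices": [{
        "finish_reason": "stop",
        "logprobs": {"content": [
            {"token": "Bio", "logprob": -0.02},
            {"token": "Fabric", "logprob": -0.11},
        ]},
    }],
}

usage = response["usage"]
finish_reason = response["choices"][-1]["finish_reason"]
log_probs = response["choices"][-1]["logprobs"]["content"]
tokens = [elt["token"] for elt in log_probs]
token_logprobs = [elt["logprob"] for elt in log_probs]
print(usage["total_tokens"], finish_reason, tokens, token_logprobs)
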
5 changes: 2 additions & 3 deletions src/palimpzest/operators/operators.py
@@ -154,9 +154,8 @@ def _createPhysicalPlans(self, max: int=10, shouldProfile: bool=False) -> Physic
if os.getenv('TOGETHER_API_KEY') is not None:
models.extend([Model.MIXTRAL])

-# TODO: uncomment once dspy pushes v2.4.1 to PyPI
-# if os.getenv('GOOGLE_API_KEY') is not None:
-#     models.extend([Model.GEMINI_1])
+if os.getenv('GOOGLE_API_KEY') is not None:
+models.extend([Model.GEMINI_1])

assert len(models) > 0, "No models available to create physical plans! You must set at least one of the following environment variables: [OPENAI_API_KEY, TOGETHER_API_KEY, GOOGLE_API_KEY]"

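A minimal sketch of the environment-driven model selection this hunk enables; the Model enum below is a stand-in for palimpzest.constants.Model, with illustrative member names.

# Stand-in sketch; Model members are illustrative, not the real constants.
import os
from enum import Enum

class Model(Enum):
    GPT_4 = "gpt-4"
    MIXTRAL = "mixtral"
    GEMINI_1 = "gemini-1"

def available_models() -> list[Model]:
    models = []
    if os.getenv("OPENAI_API_KEY") is not None:
        models.append(Model.GPT_4)
    if os.getenv("TOGETHER_API_KEY") is not None:
        models.append(Model.MIXTRAL)
    if os.getenv("GOOGLE_API_KEY") is not None:   # newly enabled by this PR
        models.append(Model.GEMINI_1)
    assert len(models) > 0, "Set OPENAI_API_KEY, TOGETHER_API_KEY, or GOOGLE_API_KEY"
    return models
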
12 changes: 0 additions & 12 deletions src/palimpzest/operators/physical.py
@@ -244,12 +244,6 @@ def __init__(self, outputSchema: Schema, source: PhysicalOp, model: Model, cardi
self.model = random.choice([Model.GPT_4V, Model.GEMINI_1V])
self.prompt_strategy = PromptStrategy.IMAGE_TO_TEXT

-# TODO: remove once dspy pushes v2.4.1 to PyPI
-if self.model == Model.GEMINI_1:
-self.model = Model.GPT_4
-elif self.model == Model.GEMINI_1V:
-self.model = Model.GPT_4V

# NOTE: need to construct profiler after all fields used by self.opId() are set
self.profiler = Profiler(op_id=self.opId())
self.profile = self.profiler.iter_profiler
@@ -446,12 +440,6 @@ def __init__(self, outputSchema: Schema, source: PhysicalOp, model: Model, cardi
self.model = random.choice([Model.GPT_4V, Model.GEMINI_1V])
self.prompt_strategy = PromptStrategy.IMAGE_TO_TEXT

-# TODO: remove once dspy pushes v2.4.1 to PyPI
-if self.model == Model.GEMINI_1:
-self.model = Model.GPT_4
-elif self.model == Model.GEMINI_1V:
-self.model = Model.GPT_4V

# NOTE: need to construct profiler after all fields used by self.opId() are set
self.profiler = Profiler(op_id=self.opId())
self.profile = self.profiler.iter_profiler