Merge pull request #18 from mikecafarella/biofabric
Support for Tables + biofabric demo
vitaglianog authored Apr 15, 2024
2 parents 08e9d46 + 9c455b2 commit 9a80b0b
Showing 28 changed files with 979 additions and 57 deletions.
4 changes: 2 additions & 2 deletions pyproject.toml
@@ -12,8 +12,7 @@ dependencies = [
"charset-normalizer>=3.3.2",
"click>=8.1.7",
"click-aliases>=1.0.4",
-# updated dspy to make it work with Google Gemini
-"dspy-ai>=2.4.1", # not available on pypi, install from source
+"dspy-ai>=2.4.1",
"fastapi~=0.100.0",
"google-generativeai==0.4.1",
"gradio>=4.20.1",
@@ -24,6 +23,7 @@ dependencies = [
"necessary>=0.3.2",
"numpy>=1.23.2",
"openai>=1.0",
"openpyxl==3.1.2",
"pandas~=2.1.1",
"papermage>=0.16.0",
"pdf2image",
12 changes: 11 additions & 1 deletion src/palimpzest/corelib/functions.py
@@ -3,6 +3,7 @@
from palimpzest.corelib import URL, WebPage, Download

import requests
from requests_html import HTMLSession # for downloading JavaScript content
import datetime
from bs4 import BeautifulSoup

@@ -32,11 +33,20 @@ def html_to_text_with_links(self, html):
text = soup.get_text(separator='\n', strip=True)
return text

def get_page_text(self, url):
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
}

session = HTMLSession()
response = session.get(url, headers=headers)
return response.text

# Someday we should introduce an abstraction that lets us
# coalesce many requests into a bulk operation. This code is
# fine for a dozen requests, but not for a million.
def map(self, dr: DataRecord):
-textcontent = requests.get(dr.url).text
+textcontent = self.get_page_text(dr.url)
dr2 = DataRecord(self.outputSchema, parent_uuid=dr._uuid)
dr2.url = dr.url

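For context, a minimal standalone sketch of the new JavaScript-capable download path: requests_html is the library imported above, while the function name, example URL, and __main__ wrapper here are illustrative only.

# Sketch of the download path added above; requests_html's HTMLSession
# behaves like requests.Session but can also render JavaScript-driven pages.
from requests_html import HTMLSession

def fetch_page_text(url: str) -> str:
    # A browser-like User-Agent avoids naive bot blocking on some sites.
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
    session = HTMLSession()
    response = session.get(url, headers=headers)
    return response.text  # raw body; response.html.render() would execute JS

if __name__ == "__main__":
    print(fetch_page_text("https://example.com")[:200])
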
19 changes: 17 additions & 2 deletions src/palimpzest/corelib/schemas.py
@@ -1,4 +1,4 @@
-from palimpzest.elements import BytesField, Schema, StringField, File
+from palimpzest.elements import BytesField, Schema, StringField, File, NumericField, ListField

###################################################################################
# "Core" useful Schemas. These are Schemas that almost everyone will need.
@@ -36,4 +36,19 @@ class WebPage(Schema):
url = StringField(desc="The URL of the web page", required=True)
text = StringField(desc="The text contents of the web page", required=True)
html = StringField(desc="The html contents of the web page", required=True)
timestamp = StringField(desc="The timestamp of the download", required=True)

class XLSFile(File):
"""An XLS file is a File that contains one or more Excel spreadsheets."""
number_sheets = NumericField(desc="The number of sheets in the Excel file", required=True)
sheet_names = ListField(element_type=StringField, desc="The names of the sheets in the Excel file", required=True)

class TabularRow(Schema):
"""A Row is a list of cells. For simplicity, we assume that all cell values are strings."""
cells = ListField(element_type=StringField, desc="The cells in the row", required=True)

class Table(Schema):
"""A Table is an object composed of a header and rows."""
name = StringField(desc="The name of the table", required=False)
header = ListField(element_type=StringField, desc="The header of the table", required=True)
rows = ListField(element_type=TabularRow, desc="The rows of the table", required=True)
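
A hedged sketch of how the new Table and TabularRow shapes could be populated from a spreadsheet with openpyxl (the dependency pinned in pyproject.toml above); load_tables and the plain-dict layout are illustrative assumptions, not Palimpzest API.

# Hypothetical illustration only: filling the Table/TabularRow shapes
# from an Excel file via openpyxl. load_tables() and the dict layout
# are assumptions, not part of this commit.
from openpyxl import load_workbook

def load_tables(path: str) -> list[dict]:
    wb = load_workbook(path, read_only=True)
    tables = []
    for sheet in wb.worksheets:
        rows = [["" if cell is None else str(cell) for cell in row]
                for row in sheet.iter_rows(values_only=True)]
        if not rows:
            continue
        tables.append({
            "name": sheet.title,                       # Table.name
            "header": rows[0],                         # Table.header
            "rows": [{"cells": r} for r in rows[1:]],  # TabularRow.cells
        })
    return tables
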
3 changes: 2 additions & 1 deletion src/palimpzest/elements/__init__.py
@@ -3,4 +3,5 @@
from .aggregatefunction import *
from .records import *
from .functions import *
from .groupbysig import *
from .pzlist import *
2 changes: 1 addition & 1 deletion src/palimpzest/elements/pzlist.py
@@ -1,4 +1,4 @@
-from palimpzest import Field
+from palimpzest.elements import Field

class ListField(Field, list):
"""A field representing a list of elements of specified types, with full list functionality."""
2 changes: 1 addition & 1 deletion src/palimpzest/elements/records.py
@@ -68,7 +68,7 @@ def asJSON(self):

def __str__(self):
keys = sorted(self.__dict__)
items = ("{}={!r}".format(k, str(self.__dict__[k])[:15]) for k in keys)
items = ("{}={!r}...".format(k, str(self.__dict__[k])[:15]) for k in keys)
return "{}({})".format(type(self).__name__, ", ".join(items))

def __eq__(self, other):
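
To illustrate the __str__ tweak above, a tiny self-contained sketch (the Demo class is invented): each field value is cut to 15 characters, and the new trailing ellipsis marks the truncation.

# Invented Demo class showing the effect of the __str__ change above.
class Demo:
    def __init__(self):
        self.url = "https://example.com/a/very/long/path"

    def __str__(self):
        keys = sorted(self.__dict__)
        items = ("{}={!r}...".format(k, str(self.__dict__[k])[:15]) for k in keys)
        return "{}({})".format(type(self).__name__, ", ".join(items))

print(Demo())  # Demo(url='https://example'...)
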
3 changes: 1 addition & 2 deletions src/palimpzest/execution/__init__.py
@@ -1,2 +1 @@

-from .execution import Execution
+from .execution import Execution, SimpleExecution
80 changes: 80 additions & 0 deletions src/palimpzest/execution/execution.py
@@ -2,6 +2,7 @@
from palimpzest.policy import Policy, MaxQuality, UserChoice
from palimpzest.profiler import StatsProcessor
from palimpzest.datamanager import DataDirectory
import itertools

def emitNestedTuple(node, indent=0):
elt, child = node
@@ -10,6 +11,43 @@ def emitNestedTuple(node, indent=0):
emitNestedTuple(child, indent=indent+2)


def flatten_nested_tuples(nested_tuples):
"""
This function takes a nested iterable of the form (4,(3,(2,(1,())))) and flattens it to (1, 2, 3, 4).
"""
result = []
def flatten(item):
if isinstance(item, tuple):
if item: # Check if not an empty tuple
flatten(item[0]) # Process the head
flatten(item[1]) # Process the tail
else:
result.append(item)

flatten(nested_tuples)
result = list(result)
result.reverse()
return result[1:]

def graphicEmit(flatten_ops):
start = flatten_ops[0]
print(f" 0. {type(start).__name__} -> {start.outputSchema.__name__} \n")

for idx, (left, right) in enumerate(itertools.pairwise(flatten_ops)):
in_schema = left.outputSchema
out_schema = right.outputSchema
print(f" {idx+1}. {in_schema.__name__} -> {type(right).__name__} -> {out_schema.__name__} ", end="")
# if right.desc is not None:
# print(f" ({right.desc})", end="")
# check if right has a model attribute
if hasattr(right, 'model'):
print(f"\n Using {right.model}", end="")
if hasattr(right, 'filter'):
print(f'\n Filter: "{right.filter.filterCondition}"', end="")
print()
print(f" ({','.join(in_schema.fieldNames())[:15]}...) -> ({','.join(out_schema.fieldNames())[:15]}...)")
print()

class Execution:
"""An Execution is responsible for completing a query on a given Set.
Right now we assume the query is always SELECT * FROM set."""
@@ -65,6 +103,9 @@ def executeAndOptimize(self, verbose: bool=False, shouldProfile: bool=False):
# TODO: remove
if verbose:
import json
import os
if not os.path.exists('profiling-data'):
os.makedirs('profiling-data')
with open('profiling-data/eo-raw_profiling.json', 'w') as f:
sp = StatsProcessor(profileData)
json.dump(sp.profiling_data.to_dict(), f)
@@ -120,3 +161,42 @@ def executeAndOptimize(self, verbose: bool=False, shouldProfile: bool=False):
# 3. gather execution data
# 4. provide all cost estimates to physical operators

class SimpleExecution(Execution):
"""
This simple execution does not pre-sample the data and uses neither caching nor profiling.
It just runs the query and returns the results.
"""

def executeAndOptimize(self, verbose: bool=False, shouldProfile: bool=False):
"""An execution of the rootset, subject to user-given policy."""
# Reoptimize the logical plan, this time with the sample data.
# (The data is not currently being used; let's see if this method can work first.)
logicalTree = self.rootset.getLogicalTree()
candidatePlans = logicalTree.createPhysicalPlanCandidates(shouldProfile=shouldProfile)
if type(self.policy) == UserChoice:
print("-----AVAILABLE PLANS -----")
for idx, cp in enumerate(candidatePlans):
print(f"Plan {idx}: Time est: {cp[0]:.3f} -- Cost est: {cp[1]:.3f} -- Quality est: {cp[2]:.3f}")
print("Physical operator tree")
physicalOps = cp[3].dumpPhysicalTree()
emitNestedTuple(physicalOps)
print("----------")
# flatten_ops = flatten_nested_tuples(physicalOps)
# graphicEmit(flatten_ops)
# print("----------")

planTime, planCost, quality, physicalTree = self.policy.choose(candidatePlans)

if verbose:
print("----------")
print(f"Policy is: {self.policy}")
print(f"Chosen plan: Time est: {planTime:.3f} -- Cost est: {planCost:.3f} -- Quality est: {quality:.3f}")
ops = physicalTree.dumpPhysicalTree()
flatten_ops = flatten_nested_tuples(ops)
graphicEmit(flatten_ops)
# emitNestedTuple(ops)

return physicalTree
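
As a sanity check on flatten_nested_tuples above, a self-contained run on the docstring's example shape; note that the trailing result[1:] drops the first element after reversal, so the call below prints [2, 3, 4].

# Self-contained copy of flatten_nested_tuples, run on the docstring's example.
def flatten_nested_tuples(nested_tuples):
    result = []
    def flatten(item):
        if isinstance(item, tuple):
            if item:                # skip the empty-tuple terminator
                flatten(item[0])    # head: the element at this level
                flatten(item[1])    # tail: the nested remainder
        else:
            result.append(item)
    flatten(nested_tuples)
    result.reverse()
    return result[1:]               # drops the innermost element

print(flatten_nested_tuples((4, (3, (2, (1, ()))))))  # [2, 3, 4]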

80 changes: 52 additions & 28 deletions src/palimpzest/generators/generators.py
@@ -70,14 +70,31 @@ def _get_model(self) -> dsp.LM:

elif self.model_name in [Model.GEMINI_1.value]:
google_key = get_api_key('GOOGLE_API_KEY')
-model = dspy.Google(model=self.model_name, api_key=google_key, return_dict=True)
+model = dspy.Google(model=self.model_name, api_key=google_key)

else:
raise ValueError("Model must be one of the language models specified in palimpzest.constants.Model")

return model

-def _get_attn(dspy_lm: dsp.LM):
def _get_usage_and_finish_reason(self, dspy_lm: dsp.LM):
"""
Parse and return the usage statistics and finish reason.
"""
usage, finish_reason = None, None
if self.model_name in [Model.GPT_3_5.value, Model.GPT_4.value]:
usage = dspy_lm.history[-1]['response']['usage']
finish_reason = dspy_lm.history[-1]['response']['choices'][-1]['finish_reason']
elif self.model_name in [Model.GEMINI_1.value]:
usage = {}
finish_reason = dspy_lm.history[-1]['response'][0]._result.candidates[0].finish_reason
elif self.model_name in [Model.MIXTRAL.value]:
usage = dspy_lm.history[-1]['response']['usage']
finish_reason = dspy_lm.history[-1]['response']['finish_reason']

return usage, finish_reason

+def _get_attn(self, dspy_lm: dsp.LM):
"""
TODO
"""
@@ -92,11 +109,19 @@ def _get_answer_log_probs(self, dspy_lm: dsp.LM, answer: str) -> List[float]:
"""
# get log probabilities data structure
tokens, token_logprobs = None, None
-if self.model_name in [Model.GPT_3_5.value, Model.GPT_4.value, Model.GEMINI_1.value]:

+if self.model_name in [Model.GPT_3_5.value, Model.GPT_4.value]:
# [{'token': 'some', 'bytes': [12, 34, ...], 'logprob': -0.7198808, 'top_logprobs': []}}]
log_probs = dspy_lm.history[-1]['response']['choices'][-1]['logprobs']['content']
tokens = list(map(lambda elt: elt['token'], log_probs))
token_logprobs = list(map(lambda elt: elt['logprob'], log_probs))
elif self.model_name in [Model.GEMINI_1.value]:
return None
# TODO Google gemini does not provide log probabilities!
# https://github.com/google/generative-ai-python/issues/238
# tok_count = dspy_lm.llm.count_tokens(answer).total_tokens
# tokens = [""] * tok_count
# token_logprobs = [0] * len(tokens)
elif self.model_name in [Model.MIXTRAL.value]:
# response: dict_keys(['prompt', 'choices', 'usage', 'finish_reason', 'tokens', 'token_logprobs'])
tokens = dspy_lm.history[-1]['response']['tokens']
@@ -105,24 +130,24 @@ def _get_answer_log_probs(self, dspy_lm: dsp.LM, answer: str) -> List[float]:
raise ValueError("Model must be one of the language models specified in palimpzest.constants.Model")

# get indices of the start and end token for the answer
-start_idx, end_idx = 0, 0
-while not answer.strip() == "".join(tokens[start_idx:end_idx+1]).strip():
-if answer.startswith(tokens[start_idx]):
-end_idx += 1
-else:
-start_idx += 1
-end_idx = start_idx

+# start_idx, end_idx = 0, 0
+# while not answer.strip() == "".join(tokens[start_idx:end_idx+1]).strip():
+# if answer.startswith(tokens[start_idx]):
+# end_idx += 1
+# else:
+# start_idx += 1
+# end_idx = start_idx
# filter for log probs of tokens which appear in answer
-answer_log_probs = token_logprobs[start_idx:end_idx+1]

+# answer_log_probs = token_logprobs[start_idx:end_idx+1]
+answer_log_probs = token_logprobs
# return those tokens log probabilities
return answer_log_probs

@retry(
wait=wait_exponential(multiplier=RETRY_MULTIPLIER, max=RETRY_MAX_SECS),
stop=stop_after_attempt(RETRY_MAX_ATTEMPTS),
after=log_attempt_number,
reraise=True,
)
def generate(self, context: str, question: str) -> GenerationOutput:
# fetch model
@@ -139,18 +164,15 @@ def generate(self, context: str, question: str) -> GenerationOutput:

# extract the log probabilities for the actual result(s) which are returned
answer_log_probs = self._get_answer_log_probs(dspy_lm, pred.answer)
usage, finish_reason = self._get_usage_and_finish_reason(dspy_lm)

# collect statistics on prompt, usage, and timing
stats = GenerationStats(
model_name=self.model_name,
llm_call_duration_secs=end_time - start_time,
prompt=dspy_lm.history[-1]['prompt'],
-usage=dspy_lm.history[-1]['response']['usage'],
-finish_reason=(
-dspy_lm.history[-1]['response']['finish_reason']
-if isinstance(dspy_lm, TogetherHFAdaptor)
-else dspy_lm.history[-1]['response']['choices'][-1]['finish_reason']
-),
+usage=usage,
+finish_reason=finish_reason,
answer_log_probs=answer_log_probs,
answer=pred.answer,
)
@@ -255,17 +277,19 @@ def _get_answer_log_probs(self, tokens: List[str], token_logprobs: List[float],
Filter and return the list of log probabilities for the tokens which appear in `answer`.
"""
# get indices of the start and end token for the answer
-start_idx, end_idx = 0, 0
-while not answer.strip() == "".join(tokens[start_idx:end_idx+1]).strip():
-if answer.startswith(tokens[start_idx]):
-end_idx += 1
-else:
-start_idx += 1
-end_idx = start_idx

-# filter for log probs of tokens which appear in answer
-answer_log_probs = token_logprobs[start_idx:end_idx+1]
+# start_idx, end_idx = 0, 0
+# while not answer.strip() == "".join(tokens[start_idx:end_idx+1]).strip():
+# if answer.startswith(tokens[start_idx]):
+# end_idx += 1
+# else:
+# start_idx += 1
+# end_idx = start_idx

+# filter for log probs of tokens which appear in answer
+# answer_log_probs = token_logprobs[start_idx:end_idx+1]
+answer_log_probs = token_logprobs

# return those tokens log probabilities
return answer_log_probs

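For reference, a hedged sketch of the response shape the OpenAI-style branches above consume; the history entry below is hand-built for illustration, not a real API call.

# Hand-built stand-in for dspy_lm.history[-1]['response'] (OpenAI-style),
# showing the fields the usage/finish_reason/logprob parsing above reads.
response = {
    "usage": {"prompt_tokens": 12, "completion_tokens": 2, "total_tokens": 14},
    "choices": [{
        "finish_reason": "stop",
        "logprobs": {"content": [
            {"token": "Bio", "logprob": -0.02},
            {"token": "Fabric", "logprob": -0.11},
        ]},
    }],
}

usage = response["usage"]
finish_reason = response["choices"][-1]["finish_reason"]
log_probs = response["choices"][-1]["logprobs"]["content"]
tokens = [elt["token"] for elt in log_probs]
token_logprobs = [elt["logprob"] for elt in log_probs]
print(usage["total_tokens"], finish_reason, tokens, token_logprobs)
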
5 changes: 2 additions & 3 deletions src/palimpzest/operators/operators.py
@@ -154,9 +154,8 @@ def _createPhysicalPlans(self, max: int=10, shouldProfile: bool=False) -> Physic
if os.getenv('TOGETHER_API_KEY') is not None:
models.extend([Model.MIXTRAL])

-# TODO: uncomment once dspy pushes v2.4.1 to PyPI
-# if os.getenv('GOOGLE_API_KEY') is not None:
-#     models.extend([Model.GEMINI_1])
+if os.getenv('GOOGLE_API_KEY') is not None:
+models.extend([Model.GEMINI_1])

assert len(models) > 0, "No models available to create physical plans! You must set at least one of the following environment variables: [OPENAI_API_KEY, TOGETHER_API_KEY, GOOGLE_API_KEY]"

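A minimal sketch of the environment-driven model selection this hunk enables; the Model enum below is a stand-in for palimpzest.constants.Model, with illustrative member names.

# Stand-in sketch; Model members are illustrative, not the real constants.
import os
from enum import Enum

class Model(Enum):
    GPT_4 = "gpt-4"
    MIXTRAL = "mixtral"
    GEMINI_1 = "gemini-1"

def available_models() -> list[Model]:
    models = []
    if os.getenv("OPENAI_API_KEY") is not None:
        models.append(Model.GPT_4)
    if os.getenv("TOGETHER_API_KEY") is not None:
        models.append(Model.MIXTRAL)
    if os.getenv("GOOGLE_API_KEY") is not None:   # newly enabled by this PR
        models.append(Model.GEMINI_1)
    assert len(models) > 0, "Set OPENAI_API_KEY, TOGETHER_API_KEY, or GOOGLE_API_KEY"
    return models
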
12 changes: 0 additions & 12 deletions src/palimpzest/operators/physical.py
@@ -244,12 +244,6 @@ def __init__(self, outputSchema: Schema, source: PhysicalOp, model: Model, cardi
self.model = random.choice([Model.GPT_4V, Model.GEMINI_1V])
self.prompt_strategy = PromptStrategy.IMAGE_TO_TEXT

-# TODO: remove once dspy pushes v2.4.1 to PyPI
-if self.model == Model.GEMINI_1:
-self.model = Model.GPT_4
-elif self.model == Model.GEMINI_1V:
-self.model = Model.GPT_4V

# NOTE: need to construct profiler after all fields used by self.opId() are set
self.profiler = Profiler(op_id=self.opId())
self.profile = self.profiler.iter_profiler
@@ -446,12 +440,6 @@ def __init__(self, outputSchema: Schema, source: PhysicalOp, model: Model, cardi
self.model = random.choice([Model.GPT_4V, Model.GEMINI_1V])
self.prompt_strategy = PromptStrategy.IMAGE_TO_TEXT

-# TODO: remove once dspy pushes v2.4.1 to PyPI
-if self.model == Model.GEMINI_1:
-self.model = Model.GPT_4
-elif self.model == Model.GEMINI_1V:
-self.model = Model.GPT_4V

# NOTE: need to construct profiler after all fields used by self.opId() are set
self.profiler = Profiler(op_id=self.opId())
self.profile = self.profiler.iter_profiler