Stephen yin01/spider analysis #11

Open · wants to merge 49 commits into base: main
49 commits
3e0e32a
Update sql2pandas with better error checking + cleanup
troyfeng116 Mar 3, 2022
89e8511
Sandbox scripts + testing
troyfeng116 Mar 9, 2022
39fb73e
Create json file with converted SQL queries
troyfeng116 Mar 15, 2022
60cdba2
Initial preprocessing: cleanup + nested SELECT framework
troyfeng116 Mar 15, 2022
26b46ac
Support for complex non-convertible queries: ProcessedSQLQueryNode fr…
troyfeng116 Mar 20, 2022
383078f
Cleanup
troyfeng116 Mar 20, 2022
d3cc6e3
Refine ProcessedSQLQueryTree structure
troyfeng116 Mar 20, 2022
5057c7e
gitignore
troyfeng116 Mar 20, 2022
7b7b8b7
Merge main (rebase)
troyfeng116 Mar 20, 2022
5b18a95
Fix l_to_r_keys
troyfeng116 Mar 20, 2022
cb1cc6f
Cleanup + internal/external symbol keys
troyfeng116 Mar 20, 2022
cdab109
Docstrings + cleanup
troyfeng116 Mar 20, 2022
e0fd20f
Update TODO
troyfeng116 Mar 20, 2022
6ea91e4
Remove extra commits for pre-untracked files
troyfeng116 Mar 20, 2022
895263a
Clean up gitignore conflicts
troyfeng116 Mar 20, 2022
e184279
Rm ast_sandbox from cache
troyfeng116 Mar 21, 2022
6e1ea8f
Rm sandbox_bad_queries.txt from git cache
troyfeng116 Mar 21, 2022
c54c725
Expand processed tree functionality to UNION/INTERSECT/EXCEPT
troyfeng116 Mar 21, 2022
cd40d80
Add output txt test file to gitignore
troyfeng116 Mar 21, 2022
473d315
Naive function to extract entire table from SELECT query
troyfeng116 Mar 21, 2022
82b9dcc
Clean up table extraction + separate table alias parsing
troyfeng116 Mar 21, 2022
4acfd59
Add table alias + substitute symbol to leaf nodes
troyfeng116 Mar 21, 2022
e607727
DFS to extract code snippets
troyfeng116 Mar 21, 2022
daba852
Cleanup some naming
troyfeng116 Mar 21, 2022
3ff0c81
Cleanup + indexing nit
troyfeng116 Mar 21, 2022
9e5a4ba
Add script to assert validity of SQL tree
troyfeng116 Mar 22, 2022
80cbfb4
Update gitignore for output files
troyfeng116 Mar 22, 2022
3ddc969
Clean up file structure + add table_expr class + debug symbol generation
troyfeng116 Mar 22, 2022
83d2c40
Docstrings + cleanup DFS
troyfeng116 Mar 22, 2022
17328bc
Refine pandas generation: initial setup to handle INTERSECT/UNION/EXCEPT
troyfeng116 Mar 22, 2022
af7570c
Move single node to pandas helpers to separate file, clean up node init
troyfeng116 Mar 22, 2022
9146a03
Docstrings + handle multiple JOIN ONs in table expression
troyfeng116 Mar 22, 2022
def0726
Update TODO
troyfeng116 Mar 22, 2022
b988ad6
Table alias removal + aliased_table_expr field
troyfeng116 Apr 1, 2022
624015f
Cleanup
troyfeng116 Apr 1, 2022
823340f
Gitignore
troyfeng116 Apr 1, 2022
51798d8
JOIN tables -> pandas
troyfeng116 Apr 2, 2022
3f79033
Cleanup + docstrings
troyfeng116 Apr 2, 2022
cdb359d
Add quick dirty fixes for spider execution
chenyx512 Apr 4, 2022
394c115
try to fix the intersect and union
niansong1996 Apr 6, 2022
f4739ad
Improve to 75% correct
chenyx512 Apr 11, 2022
98b5af6
unofficial files, playground progress for squall dataset
StephenYin01 Apr 11, 2022
4a295dd
Merge branch 'main' into yuxuan/sql2pandas
niansong1996 Apr 15, 2022
87c3760
Merge branch 'main' of github.com:Yale-LILY/NLP4Code into StephenYin0…
StephenYin01 Apr 18, 2022
3496afb
Merge branch 'yuxuan/sql2pandas' of github.com:Yale-LILY/NLP4Code int…
StephenYin01 Apr 18, 2022
b1d4cea
put WIP on remote github
StephenYin01 Jul 7, 2022
8d67d34
finished ten correction of failed conversions for codex
StephenYin01 Jul 11, 2022
2bcb5da
Merge branch 'main' of github.com:Yale-LILY/NLP4Code into StephenYin0…
StephenYin01 Jul 11, 2022
7ff994e
annotated 10 more examples
StephenYin01 Aug 3, 2022
7 changes: 7 additions & 0 deletions .gitignore
@@ -128,9 +128,16 @@ dmypy.json
 # Pyre type checker
 .pyre/
 
+# sandboxes
+**sandbox.py
+parsing/sandbox*
+
 # defined by Ansong
 .venv
 data/
+debug-tmp/
+wandb/
+results/
 
 # defined by Troy
 .DS_Store
94 changes: 74 additions & 20 deletions execution/spider_execution.py
@@ -1,6 +1,9 @@
 import sqlite3
 import pandas as pd
 import numpy as np
+import re
+import keyword
+import math
 
 from typing import List, Dict, Any, Union, Tuple
 
@@ -29,8 +32,11 @@ def spider_execution_sql(sql: str, conn: sqlite3.Connection, return_error_msg: b
 def db_to_df_dict(conn: sqlite3.Connection) -> Dict[str, pd.DataFrame]:
     df_dict = {}
     for table_name in conn.execute("SELECT name FROM sqlite_master WHERE type='table'").fetchall():
-        df_dict[table_name[0]] = pd.read_sql_query(f"SELECT * FROM {table_name[0]}", conn)
-        df_dict[table_name[0]].rename(columns=lambda x: x.lower(), inplace=True)
+        # lowercase everything, including string values and column labels
+        df = pd.read_sql_query(f"SELECT * FROM {table_name[0]}", conn)
+        df = df.applymap(lambda s: s.lower() if type(s) == str else s)
+        df_dict[table_name[0].lower()] = df
+        df_dict[table_name[0].lower()].rename(columns=lambda x: x.lower(), inplace=True)
     return df_dict
 
 def spider_execution_py(code: str, df_dict: Dict[str, pd.DataFrame], return_error_msg: bool = False) -> Any:
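To make the new normalization concrete, here is a minimal usage sketch, assuming db_to_df_dict above is in scope (the in-memory table is invented for illustration):

import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE Singer (Name TEXT, Age INTEGER)")
conn.execute("INSERT INTO Singer VALUES ('Alice', 30), ('BOB', 25)")

df_dict = db_to_df_dict(conn)
print(df_dict["singer"])  # the table key is lowercased too
# prints roughly:
#     name  age
# 0  alice   30
# 1    bob   25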
@@ -39,8 +45,22 @@ def spider_execution_py(code: str, df_dict: Dict[str, pd.DataFrame], return_erro
     # use the tables as part of the code context
     table_vars_code = "import pandas as pd\n"
     for table_name in df_dict.keys():
-        table_vars_code += f"# {' '.join(list(df_dict[table_name].columns))}\n{table_name} = df_dict['{table_name}']\n"
-    code = table_vars_code + "\n" + code
+        # table names may be reserved words like "class"
+        if table_name in keyword.kwlist:
+            table_vars_code += f"_{table_name} = df_dict['{table_name}']\n"
+            # but we have to make sure that table columns are not changed
+            # code = code.replace(table_name, f"_{table_name}")
+            code = re.sub("((?<!_)class(?!_))", "_class", code)
+        else:
+            table_vars_code += f"{table_name} = df_dict['{table_name}']\n"
+
+    # lowercase everything inside single quotes
+    code = re.sub(r"'(.*?)'", lambda p: f"'{p.group(1).lower()}'", code)
+    # move column selection after sorting or drop_duplicates
+    # TODO further processing needed, case 784, 1721,
+    # and select, drop_duplicate, followed by sorting
+    code = re.sub(r"(.*(?<!\[))(\[\[?.*?\]?\])(\.sort_values.*)", r"\1\3\2", code)
+    code = table_vars_code + "\n" + f"answer = {code}"
 
     # execute the code
     try:
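The two re.sub rewrites above are easiest to see on a concrete string. A standalone sketch (the input snippet is invented for illustration):

import re

code = "df[['Name']].sort_values(by='Age')"
# lowercase quoted literals: 'Name' -> 'name', 'Age' -> 'age'
code = re.sub(r"'(.*?)'", lambda p: f"'{p.group(1).lower()}'", code)
# move the [[...]] column selection after .sort_values(...), so sorting
# still sees columns that the projection would otherwise drop
code = re.sub(r"(.*(?<!\[))(\[\[?.*?\]?\])(\.sort_values.*)", r"\1\3\2", code)
print(code)  # df.sort_values(by='age')[['name']]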
@@ -58,43 +78,77 @@ def spider_execution_py(code: str, df_dict: Dict[str, pd.DataFrame], return_erro
     else:
         return None
 
-def flatten_list_of_list(l: List[List[Any]]) -> List[Any]:
+def flatten_list_of_list(l: List[List[Any]], sort: bool = False) -> List[Any]:
     result = []
     for sublist in l:
         if isinstance(sublist, list) or isinstance(sublist, tuple):
             result.extend(sublist)
         else:
             result.append(sublist)
 
-    return result
+    if sort:
+        result.sort(key = str)
+        return result
+    else:
+        return result
 
-def spider_answer_eq(prediction: Union[pd.DataFrame, pd.Series, List[Tuple[Any]]],
-                     gold_answer: Union[List[Tuple[Any]], int]) -> bool:
+def list_to_lower_case(l: List[Any]):
+    result = []
+    for object in l:
+        if isinstance(object, str):
+            result.append(object.lower())
+        else:
+            result.append(object)
+    return result
 
-    if isinstance(prediction, int) or isinstance(prediction, float):
+def compare_lists(l1: List[Any], l2: List[Any]) -> bool:
+    if len(l1) != len(l2):
+        return False
+    else:
+        for i in range(len(l1)):
+            if type(l1[i]) == float:
+                if not math.isclose(l1[i], l2[i]):
+                    return False
+                else:
+                    continue
+            elif l1[i] != l2[i]:
+                return False
+        return True
+
+def spider_answer_eq(prediction: Union[pd.DataFrame, pd.Series, List[Tuple[Any]]],
+                     gold_answer: Union[List[Tuple[Any]], int],
+                     sort: bool = False) -> bool:
+
+    if isinstance(prediction, (int, float)) or (
+            not isinstance(prediction, (list, pd.DataFrame, np.ndarray, tuple))
+            and np.issubdtype(prediction, np.integer)):
         prediction = [prediction]
 
     if isinstance(prediction, list) or isinstance(prediction, np.ndarray):
         if isinstance(gold_answer, list):
-            gold_flattened = flatten_list_of_list(gold_answer)
-            pred_flattened = flatten_list_of_list(prediction)
-            result = pred_flattened == gold_flattened
+            gold_flattened = list_to_lower_case(
+                flatten_list_of_list(gold_answer, sort))
+            pred_flattened = flatten_list_of_list(prediction, sort)
+            result = compare_lists(pred_flattened, gold_flattened)
         else:
             result = False
     elif isinstance(prediction, pd.DataFrame):
         if isinstance(gold_answer, list):
-            # convert the dataframe to a list of tuples and check
-            pred_list = flatten_list_of_list(list(prediction.itertuples(index=False, name=None)))
-            gold_list = flatten_list_of_list(gold_answer)
-            result = pred_list == gold_list
+            # we include the index only when it exists
+            pred_list = flatten_list_of_list(list(prediction.itertuples(
+                index=bool(prediction.index.name), name=None)), sort)
+            gold_list = list_to_lower_case(flatten_list_of_list(gold_answer, sort))
+            result = compare_lists(pred_list, gold_list)
         else:
             result = False
     elif isinstance(prediction, pd.Series):
         if isinstance(gold_answer, list):
-            # convert the series to a list of tuples and check
-            pred_list = flatten_list_of_list(prediction.tolist())
-            gold_list = flatten_list_of_list(gold_answer)
-            result = pred_list == gold_list
+            # we include the index only when it exists
+            if prediction.index.name:
+                pred_list = flatten_list_of_list(list(prediction.items()), sort)
+            else:
+                pred_list = flatten_list_of_list(prediction.tolist(), sort)
+            gold_list = list_to_lower_case(flatten_list_of_list(gold_answer, sort))
+            result = compare_lists(pred_list, gold_list)
         else:
             result = False
     else:
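Taken together, the new helpers make the answer check case-insensitive, optionally order-insensitive (sort=True), and tolerant of float rounding. A minimal sketch of the intended semantics, assuming spider_answer_eq and its helpers above are in scope (the data is invented):

import pandas as pd

gold = [("Alice", 1.0)]
pred = pd.DataFrame({"name": ["alice"], "rank": [1.0 + 1e-12]})

# gold strings are lowercased, floats are compared with math.isclose,
# and both sides are flattened before comparison
print(spider_answer_eq(pred, gold))        # True
print(spider_answer_eq(pred, [("Bob",)]))  # False: lengths differ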
36 changes: 36 additions & 0 deletions parsing/clean_query.py
@@ -0,0 +1,36 @@
import re
from helpers import trim_front_and_back


# TODO:
# - more robust parenthesis handling
# - more robust extra space removal?


# sql2pandas requires single quotes in SQL queries
def replace_quotes(sql_query):
    return sql_query.replace("\"", "\'")


# Remove extra spaces
def remove_consecutive_spaces(sql_query):
    sql_query = sql_query.strip()
    sql_query = re.sub(r"\s+", " ", sql_query)
    sql_query = re.sub(r"\( ", "(", sql_query)
    return sql_query


# Add semicolon at end of SQL query for consistency
def add_semicolon(sql_query):
    return sql_query if sql_query[-1:] == ";" else sql_query + ";"


# Basic string preprocessing/cleanup for SQL queries
def basic_clean_query(sql_query):
    sql_query = replace_quotes(sql_query)
    sql_query = remove_consecutive_spaces(sql_query)
    # TODO: ensure balance for front/back parentheses
    sql_query = trim_front_and_back(sql_query, "(", ")")

    sql_query = add_semicolon(sql_query)
    return sql_query
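An illustrative run of the full pipeline (input invented): double quotes, extra spaces, outer parentheses, and the trailing semicolon are all normalized.

print(basic_clean_query('(SELECT  name FROM  "singer"  WHERE age > 20)'))
# SELECT name FROM 'singer' WHERE age > 20;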
Empty file added parsing/data.json
Empty file.
190 changes: 190 additions & 0 deletions parsing/helpers.py
@@ -0,0 +1,190 @@
from typing import Tuple


# Removes any characters in `chars_to_remove` from the front of `s`
def trim_front(s, chars_to_remove):
    while s[0] in chars_to_remove:
        s = s[1:]
    return s


# Removes any characters in `chars_to_remove` from the back of `s`
def trim_back(s, chars_to_remove):
    while s[-1:] in chars_to_remove:
        s = s[:-1]
    return s


# Removes characters like parentheses from front/end of `s`
def trim_front_and_back(s, char_front, char_back):
    while s[0] == char_front and s[-1:] == char_back:
        s = s[1:-1]
    return s


# Find corresponding balanced closing parenthesis for opening parenthesis at index `open_idx-1`
def find_closing_parenthesis(s, open_idx):
    if s[open_idx-1] != '(':
        print('[find_closing_parenthesis] input open_idx error')
        return -1

    idx = open_idx
    ct_open = 0
    while idx < len(s):
        if s[idx] == '(':
            ct_open += 1
        elif s[idx] == ')':
            if ct_open == 0:
                return idx
            ct_open -= 1

        idx += 1

    return -1


# Determines if first non-whitespace char in `partial_sql_query` is "SELECT"
def is_next_token_select(partial_sql_query):
    return partial_sql_query.strip().find("SELECT") == 0


def is_idx_at_token_start(sql_query: str, idx: int):
    if idx >= len(sql_query):
        return False

    if sql_query[idx] == " ":
        print("[is_idx_at_token_start] idx not in word")
        return False

    if idx > 0 and not sql_query[idx-1] == " ":
        print("[is_idx_at_token_start] idx not at start of token")
        return False

    return True


def get_next_token_idx(sql_query: str, idx: int):
    while idx < len(sql_query) and sql_query[idx] != " ":
        idx += 1

    while idx < len(sql_query) and sql_query[idx] == " ":
        idx += 1

    return idx


def get_prev_token(sql_query: str, idx: int):
    if idx == 0:
        print("[get_prev_token] no prev token")
        return None

    if not is_idx_at_token_start(sql_query, idx):
        return None

    finish_idx = idx - 1
    while finish_idx - 1 >= 0 and sql_query[finish_idx-1] == " ":
        finish_idx -= 1

    start_idx = finish_idx - 1
    while start_idx - 1 >= 0 and sql_query[start_idx - 1] != " ":
        start_idx -= 1

    return sql_query[start_idx:finish_idx]

def get_second_last_token(sql_query: str):
    length = len(sql_query)
    if length < 2:
        print("[get_second_last_token] no second last token")
        return None

    finish_idx = length - 1
    while finish_idx > 0 and sql_query[finish_idx] != " ":
        finish_idx -= 1

    start_idx = finish_idx - 1
    while start_idx > 0 and sql_query[start_idx] != " ":
        start_idx -= 1

    return sql_query[start_idx:finish_idx].strip()

def get_cur_token(sql_query: str, idx: int):
    if not is_idx_at_token_start(sql_query, idx):
        return None

    finish_idx = idx
    while finish_idx < len(sql_query) and sql_query[finish_idx] != " ":
        finish_idx += 1

    return sql_query[idx:finish_idx]


def get_next_token(sql_query: str, idx: int):
    if idx >= len(sql_query) - 1:
        print("[get_next_token] no next token")
        return None

    if not is_idx_at_token_start(sql_query, idx):
        return None

    start_idx = get_next_token_idx(sql_query, idx)
    return get_cur_token(sql_query, start_idx)


def remove_prev_token(s: str, idx: int) -> Tuple[str, int]:
    """Removes previous token from idx, where idx is at start of token.

    Args:
        s (str): String from which to remove previous token
        idx (int): Index of start of token, where previous token from idx is removed.

    Returns:
        Tuple[str, int]: Redacted string, and new position of idx
    """
    if idx == 0:
        print("[remove_prev_token] no prev token")
        return None

    if not is_idx_at_token_start(s, idx):
        return None

    finish_idx = idx - 1
    while finish_idx - 1 >= 0 and s[finish_idx-1] == " ":
        finish_idx -= 1

    start_idx = finish_idx - 1
    while start_idx - 1 >= 0 and s[start_idx - 1] != " ":
        start_idx -= 1

    return s[:start_idx] + s[finish_idx:], idx - (finish_idx - start_idx)


def extract_table_column(join_on_col: str) -> str:
    """For a table column of the form TABLE.COLUMN (as in JOIN), extract COLUMN.

    Args:
        join_on_col (str): Full name of column, potentially with table specified.

    Returns:
        str: Extracted column (without specified table, if specified).
    """
    dot_idx = join_on_col.find(".")
    return join_on_col if dot_idx < 0 else join_on_col[dot_idx+1:]


def get_first_token(s: str) -> str:
    idx = s.find(" ")
    if idx < 0:
        idx = len(s)
    return s[:idx]

def subtract_sql_to_pandas(sql: str, simple: bool) -> str:
    """For a subtraction query: if `simple`, strip SELECT, parentheses, and ';'
    from the SQL; otherwise, replace the subtraction with pandas code and
    return the rewritten SQL.

    TODO fill args
    """
    if simple:
        ret = sql.replace("SELECT ", "").replace(";", "").replace("(", "").replace(")", "")
    else:
        ret = None
    return ret
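A few illustrative calls to these token helpers (inputs invented; outputs follow from the implementations above):

q = "SELECT name FROM singer WHERE age > 20"
print(get_first_token(q))            # SELECT
print(get_cur_token(q, 7))           # name
print(get_next_token(q, 7))          # FROM
print(get_prev_token(q, 12))         # name
print(get_second_last_token(q))      # >
print(find_closing_parenthesis("SUM( (a+b) )", 4))  # 11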