Merging Dataset Reranking changes (#383)

* dataset reranking changes, no verify * added reranking.csv * latest changes for reranking * added tests * minor changes * separating changes for dataset index file creation * minor fixes to the prompt * changes for using dataset index file * Remove redundant file * undo minor testing change * PR changes * final changes * fixing CI test * Added reranking dataset index to gitignore * removed print stmts * updated gitignore * Update prompt2model/dataset_retriever/description_dataset_retriever.py Co-authored-by: Graham Neubig <neubig@gmail.com> * Update prompt2model/utils/parse_responses.py Co-authored-by: Graham Neubig <neubig@gmail.com> * Update scripts/dataset_index/retrieve_dataset_info.py Co-authored-by: Graham Neubig <neubig@gmail.com> * Update scripts/dataset_index/retrieve_dataset_info.py Co-authored-by: Graham Neubig <neubig@gmail.com> * requested review changes * lint changes --------- Co-authored-by: Graham Neubig <neubig@gmail.com>
neulab · Jan 12, 2024 · f2eabc1 · f2eabc1
1 parent 947b636
commit f2eabc1
Show file tree

Hide file tree

Showing 12 changed files with 1,337 additions and 236 deletions.
diff --git a/.gitignore b/.gitignore
@@ -19,6 +19,7 @@ cached_generated_dataset/
 generated_dataset/
 huggingface_data/huggingface_datasets/dataset_index.json
 huggingface_data/huggingface_datasets/huggingface_datasets_datafinder_index
+huggingface_data/huggingface_datasets/reranking_dataset_index.json
 huggingface_data/huggingface_models/
 retrieved_dataset_dict/
 status.yaml

diff --git a/prompt2model/dataset_retriever/column_selection_prompt.py b/prompt2model/dataset_retriever/column_selection_prompt.py
@@ -2,8 +2,6 @@
 
 from __future__ import annotations  # noqa FI58
 
-import json
-
 METAPROMPT_BASE = """Your objective is to carefully analyze the task and the dataset mentioned, and decide whether the columns are relevant input, relevant output, irrelevant for the given task, or if it is ambiguous. There should be at most one output column. It is possible to have no relevant columns, in which case return the input and output column as empty lists.  Answer in a json format, with the following keys: input, output, irrelevant, ambiguous"""  # noqa: E501
 METAPROMPT_EXAMPLES = [
     (
@@ -90,19 +88,6 @@
 ENDING_LINE = "After seeing these examples with the required columns, please provide the relevant columns for this context:"  # noqa: E501
 
 
-def truncate_row(example_row: dict, max_length=50) -> str:
-    """Truncate the row before displaying if it is too long."""
-    truncated_row = {}
-    for key in example_row.keys():
-        curr_row = json.dumps(example_row[key])
-        truncated_row[key] = (
-            curr_row
-            if len(curr_row) <= max_length - 3
-            else curr_row[:max_length] + "..."
-        )
-    return json.dumps(truncated_row)
-
-
 def build_input(
     instruction: str,
     dataset_name: str,
@@ -116,7 +101,7 @@ def build_input(
         dataset_name=dataset_name,
         dataset_description=dataset_description,
         dataset_columns=dataset_columns,
-        sample_row=truncate_row(sample_row),
+        sample_row=sample_row,
     )
     input_prompt = SINGLE_DEMONSTRATION_TEMPLATE.format(
         prompt=input_prompt, columns=""