sbmaruf/project instruct data using psrc #1

Open
wants to merge 26 commits into base: main
Changes from 5 commits
27 changes: 27 additions & 0 deletions README.md
@@ -8,6 +8,33 @@ conda activate instructmultilingual
pip install -r requirements.txt
```

## Dataset Projection

### promptsource

```shell
DUMP_FOLDER='' # fill this with your desired output path
SRC_DATA_FOLDER=$DUMP_FOLDER/projection_from_psrc
mkdir -p $SRC_DATA_FOLDER
mkdir -p $SRC_DATA_FOLDER/cache

python data/project_from_psrc.py \
--dataset-name-or-paths glue glue glue glue glue \
--dataset-configs cola sst2 mrpc qqp stsb \
--prompt-templates-configs None None None None None \
--cache-dir $SRC_DATA_FOLDER/cache \
--output-dir $SRC_DATA_FOLDER \
--highlight-variables \
--add-source-metadata \
--num-proc 16
```

See the details of the arguments with:

```shell
python data/project_from_psrc.py --help
```
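
Each (dataset, config, prompt template) combination is written as one JSONL file per split under `$SRC_DATA_FOLDER/<dataset>/<config>/<split>/`. As a minimal sketch for inspecting the output (the path below is hypothetical; actual file names are derived from the prompt template signature and prompt name), the projected records can be read back like this:

```python
import json

# Hypothetical example path; real file names follow the pattern
# <prompt_template_signature>.<prompt_name>.jsonl under <dataset>/<config>/<split>/
jsonl_path = "raw/projection_from_psrc/glue/cola/train/glue_cola.Following_sentence_acceptable.jsonl"

with open(jsonl_path, encoding="utf-8") as f:
    for line in f:
        record = json.loads(line)
        # Each record holds the prompted source/target pair plus provenance metadata.
        print(record["source"])
        print(record["target"])
        print(record["dataset_name"], record["dataset_config"], record["prompt_name"])
        break
```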

## Translate

```shell
226 changes: 226 additions & 0 deletions data/project_from_psrc.py
@@ -0,0 +1,226 @@
import os
Member commented:
I'm curious why the name is project_from_psrc.py ? 👀

@sbmaruf (Contributor, Author) commented on Mar 9, 2023:
Project from promptsource. If it's not clear we can rename it as project_from_promptsource.py. But I like the psrc short form. :D

Member commented:
I think project_from_promptsource.py is better, let's change it pls

@sbmaruf (Contributor, Author) commented:
Done! :)

import json
import argparse
import datasets
from tqdm import tqdm
import concurrent.futures
from tqdm.contrib.concurrent import process_map
from promptsource.templates import DatasetTemplates



def export_dataset(
dataset_output_dir,
dataset_name,
dataset_config,
psrc_prompt_template_signature,
prompt,
dataset,
add_source_metadata=False,
highlight_variables=False,
):
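    """Apply one promptsource template to every split of `dataset` and write the
    projected (source, target) pairs, together with provenance metadata, to one
    JSONL file per split under `dataset_output_dir`."""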
splits = list(dataset.keys())
prompt_name = prompt.get_name()
for split in splits:
dataset_split = dataset[split]
json_data_path = os.path.join(dataset_output_dir, split)
os.makedirs(json_data_path, exist_ok=True)
json_data_path = os.path.join(
json_data_path,
(psrc_prompt_template_signature + "." + prompt_name).replace("/", "_").replace(" ", "_")
+ ".jsonl",
)
with open(json_data_path, "w", encoding="utf-8") as file_ptr:
total_num_sample = len(dataset_split)
for _id, sample in tqdm(
enumerate(dataset_split),
total=total_num_sample,
desc="{}_{}_{}_{}_{}".format(
dataset_name, dataset_config, split, psrc_prompt_template_signature, prompt_name
),
):
projected_sample = prompt.apply(sample, highlight_variables=False)
answer_choice_list = prompt.get_answer_choices_list(sample)
if len(projected_sample) != 2:
continue
source, target = projected_sample
projected_sample_with_metadata = {
"id": _id,
"source": source,
"target": target,
"psrc_prompt_template_signature": psrc_prompt_template_signature,
"prompt_name": prompt_name,
"prompt_answer_choice_list": answer_choice_list,
"dataset_name": dataset_name,
"dataset_config": dataset_config,
"split": split,
"metrics": prompt.metadata.metrics,
"original_task": prompt.metadata.original_task,
"choices_in_prompt": prompt.metadata.choices_in_prompt,
"languages": prompt.metadata.languages,
}
if highlight_variables:
new_projected_sample = prompt.apply(
sample, highlight_variables=highlight_variables
)
source, target = new_projected_sample
projected_sample_with_metadata["highlighted_source"] = source
projected_sample_with_metadata["highlighted_target"] = target

if add_source_metadata:
for k, v in sample.items():
k = "src_meta_{}".format(k)
assert k not in projected_sample_with_metadata
projected_sample_with_metadata[k] = v

file_ptr.write(json.dumps(projected_sample_with_metadata))
file_ptr.write("\n")
return "Completed:: {} !".format(json_data_path)


def invoke_none(lst):
    """Replace string placeholders ("None", "none", "null", "") with Python None, in place."""
    for idx, val in enumerate(lst):
        if val in ("None", "none", "null", ""):
            lst[idx] = None
    return lst


def main():

parser = argparse.ArgumentParser()
parser.add_argument(
"--dataset-name-or-paths",
nargs="+",
        default=["glue"],
        help="""A list of paths (separated by spaces) to Hugging Face datasets (or Hugging Face dataset signatures, e.g., super_glue, squad_v2).
        A list of supported datasets can be found at https://github.com/bigscience-workshop/promptsource/tree/main/promptsource/templates .
        For a new dataset, an existing prompt template can be applied as long as the JSON structure of the new dataset
        matches what the original prompt template expects."""
)
parser.add_argument(
"--dataset-configs",
nargs="+",
default=None,
        help="""A list of Hugging Face dataset configs. `--dataset-name-or-paths` together with `--dataset-configs` defines a data file.
        If a dataset has no config on the Hugging Face Hub, pass None. The first argument of `--dataset-name-or-paths` corresponds to the
        first argument of `--dataset-configs`. There must be an equal number of arguments in `--dataset-name-or-paths` and `--dataset-configs`."""
)
parser.add_argument(
"--prompt-templates-configs",
nargs="+",
default=None,
        help="""Names of the prompt templates. Use `None` to project with all available prompt templates.
        The first argument of `--dataset-name-or-paths` & `--dataset-configs` corresponds to the
        first argument of `--prompt-templates-configs`. There must be an equal number of arguments in
        `--dataset-name-or-paths`, `--dataset-configs`, and `--prompt-templates-configs`."""
)
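    # Example (illustrative): project squad_v2 (which has no config) and super_glue/boolq
    # with all of their prompt templates:
    #   --dataset-name-or-paths squad_v2 super_glue \
    #   --dataset-configs None boolq \
    #   --prompt-templates-configs None None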
parser.add_argument(
"--cache-dir",
type=str,
required=True,
        help="Path to the Hugging Face datasets cache directory. (This directory may require a large amount of disk space.)",
)
parser.add_argument(
"--output-dir", type=str, required=True,
help="Path to the output dir where the projected data will be stored."
)
parser.add_argument(
"--num-proc",
type=int,
default=9,
        help="Total number of parallel processes."
)
parser.add_argument(
"--add-source-metadata",
action="store_true",
        help="""
        Add all metadata from the source dataset. This creates new keys named `src_meta_{original_key}`,
        where `original_key` is each key (a.k.a. column name) of the original dataset.
        These values are kept alongside the completion so that the projection can be recovered later if needed.
        """,
)
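    # For example, a glue/cola row with columns {"sentence", "label", "idx"} would add
    # "src_meta_sentence", "src_meta_label", and "src_meta_idx" to every projected record.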
parser.add_argument(
"--highlight-variables",
action="store_true",
        help="""Highlight tokens that come from the prompt template versus the original dataset.
        This feature can be used to differentiate prompt tokens from input tokens.""",
)
args = parser.parse_args()

assert len(args.dataset_name_or_paths) == len(args.dataset_configs)
assert len(args.dataset_name_or_paths) == len(args.prompt_templates_configs)

invoke_none(args.dataset_name_or_paths)
invoke_none(args.dataset_configs)
invoke_none(args.prompt_templates_configs)

prompted_sample_gen_io_tuple_list = []
    # Load and cache each dataset, and build the multiprocessing inputs for the projection step.
for (dataset_name_or_path, dataset_config, prompt_template_config) in zip(
args.dataset_name_or_paths, args.dataset_configs, args.prompt_templates_configs
):
dataset = datasets.load_dataset(dataset_name_or_path, dataset_config, cache_dir=args.cache_dir)
psrc_prompt_template_signature = prompt_template_config
if psrc_prompt_template_signature is None:
if dataset_config is None:
psrc_prompt_template_signature = "{}".format(dataset_name_or_path)
else:
psrc_prompt_template_signature = "{}/{}".format(dataset_name_or_path, dataset_config)
dataset_output_dir = os.path.join(args.output_dir, dataset_name_or_path)
os.makedirs(dataset_output_dir, exist_ok=True)
if dataset_config is not None:
dataset_output_dir = os.path.join(dataset_output_dir, dataset_config)
os.makedirs(dataset_output_dir, exist_ok=True)
prompt_templates = DatasetTemplates(psrc_prompt_template_signature)
prompt_names = list(prompt_templates.name_to_id_mapping.keys())
for prompt_name in prompt_names:
prompt_template = prompt_templates[prompt_name]
prompted_sample_gen_io_tuple = (dataset_output_dir,
dataset_name_or_path,
dataset_config,
psrc_prompt_template_signature,
prompt_template,
dataset,
args.add_source_metadata,
args.highlight_variables)
prompted_sample_gen_io_tuple_list.append(prompted_sample_gen_io_tuple)

# Test a single process run
# export_dataset(
# prompted_sample_gen_io_tuple_list[0][0],
# prompted_sample_gen_io_tuple_list[0][1],
# prompted_sample_gen_io_tuple_list[0][2],
# prompted_sample_gen_io_tuple_list[0][3],
# prompted_sample_gen_io_tuple_list[0][4],
# prompted_sample_gen_io_tuple_list[0][5],
# prompted_sample_gen_io_tuple_list[0][6],
# prompted_sample_gen_io_tuple_list[0][7],
# )

    # Project the data using multiprocessing. A machine with many CPU cores is recommended; set `--num-proc` accordingly.
num_proc = min(args.num_proc, len(prompted_sample_gen_io_tuple_list))

with concurrent.futures.ProcessPoolExecutor(
max_workers=num_proc
) as executor:
        try:
            for _out in tqdm(
                executor.map(
                    export_dataset,
                    [prompted_sample_gen_io[0] for prompted_sample_gen_io in prompted_sample_gen_io_tuple_list],  # dataset_output_dir
                    [prompted_sample_gen_io[1] for prompted_sample_gen_io in prompted_sample_gen_io_tuple_list],  # dataset_name_or_path
                    [prompted_sample_gen_io[2] for prompted_sample_gen_io in prompted_sample_gen_io_tuple_list],  # dataset_config
                    [prompted_sample_gen_io[3] for prompted_sample_gen_io in prompted_sample_gen_io_tuple_list],  # psrc_prompt_template_signature
                    [prompted_sample_gen_io[4] for prompted_sample_gen_io in prompted_sample_gen_io_tuple_list],  # prompt_template
                    [prompted_sample_gen_io[5] for prompted_sample_gen_io in prompted_sample_gen_io_tuple_list],  # dataset
                    [prompted_sample_gen_io[6] for prompted_sample_gen_io in prompted_sample_gen_io_tuple_list],  # args.add_source_metadata
                    [prompted_sample_gen_io[7] for prompted_sample_gen_io in prompted_sample_gen_io_tuple_list],  # args.highlight_variables
                ),
                total=len(prompted_sample_gen_io_tuple_list),
            ):
                print(_out)
        except Exception as emsg:
            # Exceptions raised in worker processes surface when the map iterator is consumed,
            # so the whole loop is wrapped rather than the individual print call.
            print("Exception msg: {}".format(emsg))

if __name__ == "__main__":
main()
1 change: 1 addition & 0 deletions requirements.txt
@@ -51,3 +51,4 @@ wrapt==1.14.1
xxhash==3.2.0
yapf==0.32.0
yarl==1.8.2
promptsource==0.2.3
14 changes: 14 additions & 0 deletions scripts/project_from_psrc.sh
@@ -0,0 +1,14 @@
DUMP_FOLDER='./raw'
SRC_DATA_FOLDER=$DUMP_FOLDER/projection_from_psrc
mkdir -p $SRC_DATA_FOLDER
mkdir -p $SRC_DATA_FOLDER/cache

python data/project_from_psrc.py \
--dataset-name-or-paths glue glue glue glue glue \
--dataset-configs cola sst2 mrpc qqp stsb \
--prompt-templates-configs None None None None None \
--cache-dir $SRC_DATA_FOLDER/cache \
--output-dir $SRC_DATA_FOLDER \
--highlight-variables \
--add-source-metadata \
--num-proc 16