From 5af95ea5db7653b9c9127fe3a9c407c43c58784e Mon Sep 17 00:00:00 2001
From: TTTTao725
Date: Fri, 24 May 2024 17:36:56 +0200
Subject: [PATCH] fix invalid datasets

---
 .../scripts/convert_ai_aktindsigt_to_jsonlgz.py            | 6 +++---
 .../scripts/convert_colossal_oscar_10_to_jsonlgz.py        | 2 +-
 data-processing/scripts/convert_dr_facebook_to_jsonlgz.py  | 8 ++++----
 data-processing/scripts/convert_eur_lex_da_to_jsonlgz.py   | 8 ++++----
 data-processing/scripts/convert_ft_speech_to_jsonlgz.py    | 4 ++--
 .../scripts/convert_scandi_reddit_to_jsonlgz.py            | 2 +-
 data-processing/scripts/convert_scandi_wiki_to_jsonlgz.py  | 2 +-
 7 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/data-processing/scripts/convert_ai_aktindsigt_to_jsonlgz.py b/data-processing/scripts/convert_ai_aktindsigt_to_jsonlgz.py
index d52871d3..cf6c2890 100644
--- a/data-processing/scripts/convert_ai_aktindsigt_to_jsonlgz.py
+++ b/data-processing/scripts/convert_ai_aktindsigt_to_jsonlgz.py
@@ -69,9 +69,9 @@ def main():
     print("Reshaping into dolma format")
     df["id"] = df.apply(lambda row:'%s_%s' % (row.name[0],row.name[1]),axis=1)
     df["sha512"] = df.apply(lambda row:'%s' % row.name[1],axis=1)
-    df["source"] = "AI-aktindsigt"
-    df["added"] = datetime.datetime.now(datetime.UTC).strftime("%Y-%m-%dT%H:%M:%S.000Z")
-    df["created"]: "1970-01-01T00:00:00.000Z,2024-04-01T00:00:00.000Z" # best guess creation time, between 1970 and release time
+    df["source"] = "ai_aktindsigt"
+    df["added"] = datetime.datetime.now(datetime.UTC).strftime("%Y-%m-%d")
+    df["created"] = "1970-01-01, 2024-04-01" # best guess creation time, between 1970 and release time
 
     metadata_keys = ["url", "kommune", "sentence", "ppl_score", "sha512"]
     df["metadata"] = df.apply(lambda row: {k: row[k] for k in metadata_keys}, axis=1)
diff --git a/data-processing/scripts/convert_colossal_oscar_10_to_jsonlgz.py b/data-processing/scripts/convert_colossal_oscar_10_to_jsonlgz.py
index 08733402..e67ff9a3 100644
--- a/data-processing/scripts/convert_colossal_oscar_10_to_jsonlgz.py
+++ b/data-processing/scripts/convert_colossal_oscar_10_to_jsonlgz.py
@@ -93,7 +93,7 @@ def process_one(output_dir: Path, root_path: Path, input_path: Path) -> None:
             "id": str(obj["warc_headers"]["warc-record-id"]), # Copy metadata id to root
             "text": obj["content"],
             "source": "colossal_oscar_1.0",
-            "added": datetime.datetime.now(datetime.UTC).strftime("%Y-%m-%dT%H:%M:%S.000Z"),
+            "added": datetime.datetime.now(datetime.UTC).strftime("%Y-%m-%d"),
             "created": obj["warc_headers"]["warc-date"], # Copy metadata to root
             "metadata": obj["metadata"],
         }
diff --git a/data-processing/scripts/convert_dr_facebook_to_jsonlgz.py b/data-processing/scripts/convert_dr_facebook_to_jsonlgz.py
index b2f3f21d..b70d6ad7 100644
--- a/data-processing/scripts/convert_dr_facebook_to_jsonlgz.py
+++ b/data-processing/scripts/convert_dr_facebook_to_jsonlgz.py
@@ -9,7 +9,7 @@ from zoneinfo import ZoneInfo
 
 
 def convert_file(input_path: Path, output_dir: Path):
-    added_time = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.000Z"),
+    added_time = datetime.datetime.now().strftime("%Y-%m-%d")
     with tarfile.open(input_path, "r") as tarf:
         for member in tarf.getmembers():
             iobytes = tarf.extractfile(member)
@@ -19,9 +19,9 @@ def convert_file(input_path: Path, output_dir: Path):
             reader = csv.DictReader(iotext)
             for i, row in enumerate(reader):
                 tz = ZoneInfo("Europe/Copenhagen")
-                created_date = datetime.datetime.strptime(row["date"], "%Y-%m-%d %H:%M:%S")
-                tz_name = tz.tzname(created_date) or ""
-                created_date_str = created_date.strftime("%Y-%m-%dT%H:%M:%S.000") + tz_name
+                created_date = datetime.datetime.strptime(row["date"], "%Y-%m-%d %H:%M:%S")
+                # tz_name = tz.tzname(created_date) or ""
+                created_date_str = created_date.strftime("%Y-%m-%d") + ", " + added_time
                 new_obj = {
                     "id": str(i),
                     "text": row["text"],
diff --git a/data-processing/scripts/convert_eur_lex_da_to_jsonlgz.py b/data-processing/scripts/convert_eur_lex_da_to_jsonlgz.py
index 77615b5c..d1ec97dd 100644
--- a/data-processing/scripts/convert_eur_lex_da_to_jsonlgz.py
+++ b/data-processing/scripts/convert_eur_lex_da_to_jsonlgz.py
@@ -16,11 +16,11 @@
 """
 
 import datetime
-from datasets import Dataset, DatasetDict, load_dataset
+from datasets import Dataset, DatasetDict, load_dataset  # type: ignore
 
-eu_start_time = "1993-11-01T00:00:00.000Z"
-date_added = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.000Z")
-eu_time_span = ",".join([eu_start_time, date_added])
+eu_start_time = "1993-11-01"
+date_added = datetime.datetime.now().strftime("%Y-%m-%d")
+eu_time_span = ", ".join([eu_start_time, date_added])
 
 
 def reformat_dataset(ds: Dataset) -> Dataset:
diff --git a/data-processing/scripts/convert_ft_speech_to_jsonlgz.py b/data-processing/scripts/convert_ft_speech_to_jsonlgz.py
index e5479ce3..314abc0a 100644
--- a/data-processing/scripts/convert_ft_speech_to_jsonlgz.py
+++ b/data-processing/scripts/convert_ft_speech_to_jsonlgz.py
@@ -7,8 +7,8 @@ import sys
 
 
 def convert_file(input_path: Path, output_dir: Path):
-    added_time = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.000Z"),
-    created_date_str = "2017-01-01T00:00:00.000Z, 2024-01-01T00:00:00.000Z"
+    added_time = datetime.datetime.now().strftime("%Y-%m-%d")
+    created_date_str = "2017-01-01, 2024-01-01"
     with tarfile.open(input_path, "r") as tarf:
         for member in tarf.getmembers():
             iobytes = tarf.extractfile(member)
diff --git a/data-processing/scripts/convert_scandi_reddit_to_jsonlgz.py b/data-processing/scripts/convert_scandi_reddit_to_jsonlgz.py
index 0f867553..f83cb8db 100644
--- a/data-processing/scripts/convert_scandi_reddit_to_jsonlgz.py
+++ b/data-processing/scripts/convert_scandi_reddit_to_jsonlgz.py
@@ -15,7 +15,7 @@ from datasets import Dataset, DatasetDict, load_dataset  # type: ignore
 
 
 reddit_time = "2005-12-01, 2022-11-01"
-date_added = datetime.datetime.utcnow().strftime("%Y-%m-%d")
+date_added = datetime.datetime.now().strftime("%Y-%m-%d")
 
 
 def reformat_dataset(ds: Dataset) -> Dataset:
diff --git a/data-processing/scripts/convert_scandi_wiki_to_jsonlgz.py b/data-processing/scripts/convert_scandi_wiki_to_jsonlgz.py
index bfee2b13..43a8d613 100644
--- a/data-processing/scripts/convert_scandi_wiki_to_jsonlgz.py
+++ b/data-processing/scripts/convert_scandi_wiki_to_jsonlgz.py
@@ -51,7 +51,7 @@ def main():
                 "text": entry["text"],
                 "source": "scandi-wiki",
                 "added": time_added,
-                "created": wiki_created + "," + time_added,
+                "created": wiki_created + ", " + time_added,
                 "metadata": {"url": entry["url"], "title": entry["title"], "language": Path(member.name).stem}
             }
             json.dump(output_entry, outfile)