Commit 6393778
Merge pull request #277 from centre-for-humanities-computing/fix_datasets

fix invalid datasets
peterbjorgensen authored May 28, 2024
2 parents 57bd787 + 5af95ea commit 6393778
Showing 7 changed files with 16 additions and 16 deletions.
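
The common thread across all seven scripts is the dolma-style record shape ("Reshaping into dolma format" in the first diff): "added" becomes a date-only UTC stamp and "created" becomes a "start, end" date span. A hypothetical record illustrating the convention after this commit (all field values here are made up):

# Hypothetical dolma-style record after this commit: "added" is a
# date-only UTC stamp, "created" is a "start, end" date span.
record = {
    "id": "0_0",
    "text": "Et dansk eksempel.",
    "source": "ai_aktindsigt",
    "added": "2024-05-28",
    "created": "1970-01-01, 2024-04-01",
    "metadata": {"url": "https://example.dk"},
}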
data-processing/scripts/convert_ai_aktindsigt_to_jsonlgz.py (3 additions & 3 deletions)
@@ -69,9 +69,9 @@ def main():
print("Reshaping into dolma format")
df["id"] = df.apply(lambda row:'%s_%s' % (row.name[0],row.name[1]),axis=1)
df["sha512"] = df.apply(lambda row:'%s' % row.name[1],axis=1)
df["source"] = "AI-aktindsigt"
df["added"] = datetime.datetime.now(datetime.UTC).strftime("%Y-%m-%dT%H:%M:%S.000Z")
df["created"]: "1970-01-01T00:00:00.000Z,2024-04-01T00:00:00.000Z" # best guess creation time, between 1970 and release time
df["source"] = "ai_aktindsigt"
df["added"] = datetime.datetime.now(datetime.UTC).strftime("%Y-%m-%d")
df["created"] = "1970-01-01, 2024-04-01" # best guess creation time, between 1970 and release time

metadata_keys = ["url", "kommune", "sentence", "ppl_score", "sha512"]
df["metadata"] = df.apply(lambda row: {k: row[k] for k in metadata_keys}, axis=1)
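
The old df["created"]: line in convert_ai_aktindsigt_to_jsonlgz.py was a silent no-op: with a colon instead of =, the statement parses as a variable annotation rather than an assignment, so the column was never written. A minimal sketch demonstrating the difference (the one-column frame is hypothetical):

import pandas as pd

df = pd.DataFrame({"text": ["hello"]})

# Annotation, not assignment: parses as an AnnAssign node and never
# touches the frame, so no column is created and no error is raised.
df["created"]: "1970-01-01, 2024-04-01"
print("created" in df.columns)  # False

# The fixed line actually assigns the column.
df["created"] = "1970-01-01, 2024-04-01"
print("created" in df.columns)  # True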
@@ -93,7 +93,7 @@ def process_one(output_dir: Path, root_path: Path, input_path: Path) -> None:
"id": str(obj["warc_headers"]["warc-record-id"]), # Copy metadata id to root
"text": obj["content"],
"source": "colossal_oscar_1.0",
"added": datetime.datetime.now(datetime.UTC).strftime("%Y-%m-%dT%H:%M:%S.000Z"),
"added": datetime.datetime.now(datetime.UTC).strftime("%Y-%m-%d"),
"created": obj["warc_headers"]["warc-date"], # Copy metadata to root
"metadata": obj["metadata"],
}
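
For context, a self-contained sketch of the record construction around this hunk; the obj dict is a hypothetical stand-in for one parsed line of an OSCAR shard (real shards carry more fields), and datetime.UTC requires Python 3.11+:

import datetime

# Hypothetical parsed OSCAR line.
obj = {
    "content": "Et dansk eksempel.",
    "warc_headers": {
        "warc-record-id": "<urn:uuid:00000000-0000-0000-0000-000000000000>",
        "warc-date": "2023-06-01T12:00:00Z",
    },
    "metadata": {"identification": {"label": "da"}},
}

record = {
    "id": str(obj["warc_headers"]["warc-record-id"]),  # copy metadata id to root
    "text": obj["content"],
    "source": "colossal_oscar_1.0",
    "added": datetime.datetime.now(datetime.UTC).strftime("%Y-%m-%d"),  # e.g. "2024-05-28"
    "created": obj["warc_headers"]["warc-date"],  # copy metadata to root
    "metadata": obj["metadata"],
}
print(record["added"])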
data-processing/scripts/convert_dr_facebook_to_jsonlgz.py (4 additions & 4 deletions)
@@ -9,7 +9,7 @@
 from zoneinfo import ZoneInfo

 def convert_file(input_path: Path, output_dir: Path):
-    added_time = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.000Z"),
+    added_time = datetime.datetime.now().strftime("%Y-%m-%d"),
     with tarfile.open(input_path, "r") as tarf:
         for member in tarf.getmembers():
             iobytes = tarf.extractfile(member)
@@ -19,9 +19,9 @@ def convert_file(input_path: Path, output_dir: Path):
             reader = csv.DictReader(iotext)
             for i, row in enumerate(reader):
                 tz = ZoneInfo("Europe/Copenhagen")
-                created_date = datetime.datetime.strptime(row["date"], "%Y-%m-%d %H:%M:%S")
-                tz_name = tz.tzname(created_date) or ""
-                created_date_str = created_date.strftime("%Y-%m-%dT%H:%M:%S.000") + tz_name
+                created_date = datetime.datetime.strptime(row["date"], "%Y-%m-%d")
+                # tz_name = tz.tzname(created_date) or ""
+                created_date_str = created_date.strftime("%Y-%m-%d") + ", " + added_time
                 new_obj = {
                     "id": str(i),
                     "text": row["text"],
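
A sketch of the new created-span construction, assuming row["date"] is a plain YYYY-MM-DD string; note that the committed added_time line ends in a trailing comma, which makes it a one-element tuple, so this sketch drops the comma to keep the concatenation a plain string:

import datetime

# No trailing comma here: added_time must be a str for the "+" below.
added_time = datetime.datetime.now().strftime("%Y-%m-%d")

row = {"date": "2019-03-14"}  # hypothetical CSV row with a date-only field
created_date = datetime.datetime.strptime(row["date"], "%Y-%m-%d")
created_date_str = created_date.strftime("%Y-%m-%d") + ", " + added_time
print(created_date_str)  # e.g. "2019-03-14, 2024-05-28"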
data-processing/scripts/convert_eur_lex_da_to_jsonlgz.py (4 additions & 4 deletions)
@@ -16,11 +16,11 @@
"""
import datetime

from datasets import Dataset, DatasetDict, load_dataset
from datasets import Dataset, DatasetDict, load_dataset # type: ignore

eu_start_time = "1993-11-01T00:00:00.000Z"
date_added = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.000Z")
eu_time_span = ",".join([eu_start_time, date_added])
eu_start_time = "1993-11-01"
date_added = datetime.datetime.now().strftime("%Y-%m-%d")
eu_time_span = ", ".join([eu_start_time, date_added])


def reformat_dataset(ds: Dataset) -> Dataset:
data-processing/scripts/convert_ft_speech_to_jsonlgz.py (2 additions & 2 deletions)
@@ -7,8 +7,8 @@
 import sys

 def convert_file(input_path: Path, output_dir: Path):
-    added_time = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.000Z"),
-    created_date_str = "2017-01-01T00:00:00.000Z, 2024-01-01T00:00:00.000Z"
+    added_time = datetime.datetime.now().strftime("%Y-%m-%d"),
+    created_date_str = "2017-01-01, 2024-01-01"
     with tarfile.open(input_path, "r") as tarf:
         for member in tarf.getmembers():
             iobytes = tarf.extractfile(member)
@@ -15,7 +15,7 @@
 from datasets import Dataset, DatasetDict, load_dataset  # type: ignore

 reddit_time = "2005-12-01, 2022-11-01"
-date_added = datetime.datetime.utcnow().strftime("%Y-%m-%d")
+date_added = datetime.datetime.now().strftime("%Y-%m-%d")


 def reformat_dataset(ds: Dataset) -> Dataset:
data-processing/scripts/convert_scandi_wiki_to_jsonlgz.py (1 addition & 1 deletion)
@@ -51,7 +51,7 @@ def main():
"text": entry["text"],
"source": "scandi-wiki",
"added": time_added,
"created": wiki_created + "," + time_added,
"created": wiki_created + ", " + time_added,
"metadata": {"url": entry["url"], "title": entry["title"], "language": Path(member.name).stem}
}
json.dump(output_entry, outfile)
