Merge pull request #285 from centre-for-humanities-computing/fix_errors_and_continue_fixing_timestamps

fix timestamps for colossal_oscar_1_0, hplt, ncc
centre-for-humanities-computing · May 30, 2024 · 354ec78 · 354ec78
2 parents 6969071 + fde356e
commit 354ec78
Show file tree

Hide file tree

Showing 3 changed files with 10 additions and 7 deletions.
diff --git a/data-processing/scripts/convert_colossal_oscar_10_to_jsonlgz.py b/data-processing/scripts/convert_colossal_oscar_10_to_jsonlgz.py
@@ -89,12 +89,13 @@ def process_one(output_dir: Path, root_path: Path, input_path: Path) -> None:
                     out_fh.close()
                 out_fh = gzip.open(output_path, "wt")
             obj = json.loads(line)
+            created = datetime.datetime.strptime(obj["warc_headers"]["warc-date"], "%Y-%m-%dT%H:%M:%SZ").strftime("%Y-%m-%d")
             new_obj: dict[str, Union[dict[str, Any], str]] = {
                 "id": str(obj["warc_headers"]["warc-record-id"]), # Copy metadata id to root
                 "text": obj["content"],
-                "source": "colossal_oscar_1.0",
+                "source": "colossal_oscar_1_0", # Make source name consistent with the dataset name: colossal_oscar_1_0
                 "added": datetime.datetime.now(datetime.UTC).strftime("%Y-%m-%d"),
-                "created": obj["warc_headers"]["warc-date"], # Copy metadata to root
+                "created": created + ", " + created,
                 "metadata": obj["metadata"],
             }
             new_obj["metadata"]["warc_headers"] = obj["warc_headers"] # type: ignore

diff --git a/data-processing/scripts/convert_hplt_to_jsonlgz.py b/data-processing/scripts/convert_hplt_to_jsonlgz.py
@@ -31,8 +31,10 @@ def process_one(output_dir: Path, input_path: Path) -> None:
             new_obj: dict[str, Union[dict[str, str], str]] = {
                 "id": str(obj["id"]),
                 "text": obj["text"],
-                "source": "hplt1.2",
-                "added": datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.000Z"),
+                # "source": "hplt1.2",
+                "source": "hplt",
+                "added": datetime.datetime.now().strftime("%Y-%m-%d"),
+                "created": "2000-01-01, 2024-05-30",
                 "metadata": {},
             }
             for key, val in obj.items():

diff --git a/data-processing/scripts/convert_ncc_to_jsonlgz.py b/data-processing/scripts/convert_ncc_to_jsonlgz.py
@@ -23,7 +23,7 @@
 
 
 EXPORT_PATH = "ncc.jsonl.gz"
-date_added = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.000Z")
+date_added = datetime.datetime.now().strftime("%Y-%m-%d")
 
 
 def convert_from_iterable_to_ds(iterable_ds: IterableDataset) -> Dataset:
@@ -89,9 +89,9 @@ def _structure_records(obs: dict) -> dict:
     obs = {
         "id": obs["id"],
         "text": obs["text"],
-        "source": "NCC",
+        "source": "ncc",
         "added": date_added,
-        "created": f"{publish_year}-01-01T00:00:00.000Z",
+        "created": f"{publish_year}-01-01, {date_added}",
         "metadata": {
             "doc_type": obs["doc_type"],
             "lang_fasttext": obs["lang_fasttext"],