Skip to content

Commit

Permalink
Merge pull request #285 from centre-for-humanities-computing/fix_errors_and_continue_fixing_timestamps
Browse files Browse the repository at this point in the history

fix timestamps for colossal_oscar_1_0, hplt, ncc
  • Loading branch information
TTTTao725 authored May 30, 2024
2 parents 6969071 + fde356e commit 354ec78
Show file tree
Hide file tree
Showing 3 changed files with 10 additions and 7 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -89,12 +89,13 @@ def process_one(output_dir: Path, root_path: Path, input_path: Path) -> None:
out_fh.close()
out_fh = gzip.open(output_path, "wt")
obj = json.loads(line)
created = datetime.datetime.strptime(obj["warc_headers"]["warc-date"], "%Y-%m-%dT%H:%M:%SZ").strftime("%Y-%m-%d")
new_obj: dict[str, Union[dict[str, Any], str]] = {
"id": str(obj["warc_headers"]["warc-record-id"]), # Copy metadata id to root
"text": obj["content"],
"source": "colossal_oscar_1.0",
"source": "colossal_oscar_1_0", # Make source name consistent with the dataset name: colossal_oscar_1_0
"added": datetime.datetime.now(datetime.UTC).strftime("%Y-%m-%d"),
"created": obj["warc_headers"]["warc-date"], # Copy metadata to root
"created": created + ", " + created,
"metadata": obj["metadata"],
}
new_obj["metadata"]["warc_headers"] = obj["warc_headers"] # type: ignore
Expand Down
6 changes: 4 additions & 2 deletions data-processing/scripts/convert_hplt_to_jsonlgz.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,10 @@ def process_one(output_dir: Path, input_path: Path) -> None:
new_obj: dict[str, Union[dict[str, str], str]] = {
"id": str(obj["id"]),
"text": obj["text"],
"source": "hplt1.2",
"added": datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.000Z"),
# "source": "hplt1.2",
"source": "hplt",
"added": datetime.datetime.now().strftime("%Y-%m-%d"),
"created": "2000-01-01, 2024-05-30",
"metadata": {},
}
for key, val in obj.items():
Expand Down
6 changes: 3 additions & 3 deletions data-processing/scripts/convert_ncc_to_jsonlgz.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@


EXPORT_PATH = "ncc.jsonl.gz"
date_added = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.000Z")
date_added = datetime.datetime.now().strftime("%Y-%m-%d")


def convert_from_iterable_to_ds(iterable_ds: IterableDataset) -> Dataset:
Expand Down Expand Up @@ -89,9 +89,9 @@ def _structure_records(obs: dict) -> dict:
obs = {
"id": obs["id"],
"text": obs["text"],
"source": "NCC",
"source": "ncc",
"added": date_added,
"created": f"{publish_year}-01-01T00:00:00.000Z",
"created": f"{publish_year}-01-01, {date_added}",
"metadata": {
"doc_type": obs["doc_type"],
"lang_fasttext": obs["lang_fasttext"],
Expand Down

0 comments on commit 354ec78

Please sign in to comment.