Commit 6393778
Merge pull request #277 from centre-for-humanities-computing/fix_datasets

fix invalid datasets
peterbjorgensen authored May 28, 2024
2 parents 57bd787 + 5af95ea commit 6393778
Showing 7 changed files with 16 additions and 16 deletions.
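
The common thread across all seven scripts is the dolma-style record shape ("Reshaping into dolma format" in the first diff): "added" becomes a date-only UTC stamp and "created" becomes a "start, end" date span. A hypothetical record illustrating the convention after this commit (all field values here are made up):

# Hypothetical dolma-style record after this commit: "added" is a
# date-only UTC stamp, "created" is a "start, end" date span.
record = {
    "id": "0_0",
    "text": "Et dansk eksempel.",
    "source": "ai_aktindsigt",
    "added": "2024-05-28",
    "created": "1970-01-01, 2024-04-01",
    "metadata": {"url": "https://example.dk"},
}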
data-processing/scripts/convert_ai_aktindsigt_to_jsonlgz.py (3 additions & 3 deletions)
@@ -69,9 +69,9 @@ def main():
print("Reshaping into dolma format")
df["id"] = df.apply(lambda row:'%s_%s' % (row.name[0],row.name[1]),axis=1)
df["sha512"] = df.apply(lambda row:'%s' % row.name[1],axis=1)
df["source"] = "AI-aktindsigt"
df["added"] = datetime.datetime.now(datetime.UTC).strftime("%Y-%m-%dT%H:%M:%S.000Z")
df["created"]: "1970-01-01T00:00:00.000Z,2024-04-01T00:00:00.000Z" # best guess creation time, between 1970 and release time
df["source"] = "ai_aktindsigt"
df["added"] = datetime.datetime.now(datetime.UTC).strftime("%Y-%m-%d")
df["created"] = "1970-01-01, 2024-04-01" # best guess creation time, between 1970 and release time

metadata_keys = ["url", "kommune", "sentence", "ppl_score", "sha512"]
df["metadata"] = df.apply(lambda row: {k: row[k] for k in metadata_keys}, axis=1)
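
The old df["created"]: line in convert_ai_aktindsigt_to_jsonlgz.py was a silent no-op: with a colon instead of =, the statement parses as a variable annotation rather than an assignment, so the column was never written. A minimal sketch demonstrating the difference (the one-column frame is hypothetical):

import pandas as pd

df = pd.DataFrame({"text": ["hello"]})

# Annotation, not assignment: parses as an AnnAssign node and never
# touches the frame, so no column is created and no error is raised.
df["created"]: "1970-01-01, 2024-04-01"
print("created" in df.columns)  # False

# The fixed line actually assigns the column.
df["created"] = "1970-01-01, 2024-04-01"
print("created" in df.columns)  # True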
@@ -93,7 +93,7 @@ def process_one(output_dir: Path, root_path: Path, input_path: Path) -> None:
"id": str(obj["warc_headers"]["warc-record-id"]), # Copy metadata id to root
"text": obj["content"],
"source": "colossal_oscar_1.0",
"added": datetime.datetime.now(datetime.UTC).strftime("%Y-%m-%dT%H:%M:%S.000Z"),
"added": datetime.datetime.now(datetime.UTC).strftime("%Y-%m-%d"),
"created": obj["warc_headers"]["warc-date"], # Copy metadata to root
"metadata": obj["metadata"],
}
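
For context, a self-contained sketch of the record construction around this hunk; the obj dict is a hypothetical stand-in for one parsed line of an OSCAR shard (real shards carry more fields), and datetime.UTC requires Python 3.11+:

import datetime

# Hypothetical parsed OSCAR line.
obj = {
    "content": "Et dansk eksempel.",
    "warc_headers": {
        "warc-record-id": "<urn:uuid:00000000-0000-0000-0000-000000000000>",
        "warc-date": "2023-06-01T12:00:00Z",
    },
    "metadata": {"identification": {"label": "da"}},
}

record = {
    "id": str(obj["warc_headers"]["warc-record-id"]),  # copy metadata id to root
    "text": obj["content"],
    "source": "colossal_oscar_1.0",
    "added": datetime.datetime.now(datetime.UTC).strftime("%Y-%m-%d"),  # e.g. "2024-05-28"
    "created": obj["warc_headers"]["warc-date"],  # copy metadata to root
    "metadata": obj["metadata"],
}
print(record["added"])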
data-processing/scripts/convert_dr_facebook_to_jsonlgz.py (4 additions & 4 deletions)
@@ -9,7 +9,7 @@
 from zoneinfo import ZoneInfo

 def convert_file(input_path: Path, output_dir: Path):
-    added_time = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.000Z"),
+    added_time = datetime.datetime.now().strftime("%Y-%m-%d"),
     with tarfile.open(input_path, "r") as tarf:
         for member in tarf.getmembers():
             iobytes = tarf.extractfile(member)
@@ -19,9 +19,9 @@ def convert_file(input_path: Path, output_dir: Path):
             reader = csv.DictReader(iotext)
             for i, row in enumerate(reader):
                 tz = ZoneInfo("Europe/Copenhagen")
-                created_date = datetime.datetime.strptime(row["date"], "%Y-%m-%d %H:%M:%S")
-                tz_name = tz.tzname(created_date) or ""
-                created_date_str = created_date.strftime("%Y-%m-%dT%H:%M:%S.000") + tz_name
+                created_date = datetime.datetime.strptime(row["date"], "%Y-%m-%d")
+                # tz_name = tz.tzname(created_date) or ""
+                created_date_str = created_date.strftime("%Y-%m-%d") + ", " + added_time
                 new_obj = {
                     "id": str(i),
                     "text": row["text"],
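
A sketch of the new created-span construction, assuming row["date"] is a plain YYYY-MM-DD string; note that the committed added_time line ends in a trailing comma, which makes it a one-element tuple, so this sketch drops the comma to keep the concatenation a plain string:

import datetime

# No trailing comma here: added_time must be a str for the "+" below.
added_time = datetime.datetime.now().strftime("%Y-%m-%d")

row = {"date": "2019-03-14"}  # hypothetical CSV row with a date-only field
created_date = datetime.datetime.strptime(row["date"], "%Y-%m-%d")
created_date_str = created_date.strftime("%Y-%m-%d") + ", " + added_time
print(created_date_str)  # e.g. "2019-03-14, 2024-05-28"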
data-processing/scripts/convert_eur_lex_da_to_jsonlgz.py (4 additions & 4 deletions)
@@ -16,11 +16,11 @@
"""
import datetime

from datasets import Dataset, DatasetDict, load_dataset
from datasets import Dataset, DatasetDict, load_dataset # type: ignore

eu_start_time = "1993-11-01T00:00:00.000Z"
date_added = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.000Z")
eu_time_span = ",".join([eu_start_time, date_added])
eu_start_time = "1993-11-01"
date_added = datetime.datetime.now().strftime("%Y-%m-%d")
eu_time_span = ", ".join([eu_start_time, date_added])


def reformat_dataset(ds: Dataset) -> Dataset:
data-processing/scripts/convert_ft_speech_to_jsonlgz.py (2 additions & 2 deletions)
@@ -7,8 +7,8 @@
 import sys

 def convert_file(input_path: Path, output_dir: Path):
-    added_time = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.000Z"),
-    created_date_str = "2017-01-01T00:00:00.000Z, 2024-01-01T00:00:00.000Z"
+    added_time = datetime.datetime.now().strftime("%Y-%m-%d"),
+    created_date_str = "2017-01-01, 2024-01-01"
     with tarfile.open(input_path, "r") as tarf:
         for member in tarf.getmembers():
             iobytes = tarf.extractfile(member)
@@ -15,7 +15,7 @@
 from datasets import Dataset, DatasetDict, load_dataset  # type: ignore

 reddit_time = "2005-12-01, 2022-11-01"
-date_added = datetime.datetime.utcnow().strftime("%Y-%m-%d")
+date_added = datetime.datetime.now().strftime("%Y-%m-%d")


 def reformat_dataset(ds: Dataset) -> Dataset:
data-processing/scripts/convert_scandi_wiki_to_jsonlgz.py (1 addition & 1 deletion)
@@ -51,7 +51,7 @@ def main():
"text": entry["text"],
"source": "scandi-wiki",
"added": time_added,
"created": wiki_created + "," + time_added,
"created": wiki_created + ", " + time_added,
"metadata": {"url": entry["url"], "title": entry["title"], "language": Path(member.name).stem}
}
json.dump(output_entry, outfile)
