Commit

Merge pull request #282 from centre-for-humanities-computing/convert_domsdatabasen

Convert domsdatabasen

TTTTao725 authored May 30, 2024
2 parents d3b1c2e + 6b2da11 commit 6969071
Showing 1 changed file with 87 additions and 0 deletions.

data-processing/scripts/convert_domsdatabasen_to_jsonlgz.py (+87 −0)
@@ -0,0 +1,87 @@
"""
downloads dataset and save it as jsonl.gz file with the format:
{
"id": "...", # MANDATORY: source-specific identifier
"text": "foo", # MANDATORY: textual content of the document
"source": "...", # MANDATORY: source of the data, such as peS2o, common-crawl, etc.
"added": "...", # OPTIONAL: timestamp ai2 acquired this data
"created": "..." # OPTIONAL: timestamp when orig document was created (best-guess if not available)
"metadata": {...} # OPTIONAL: source-specific metadata
}
The dataset contains reference and summaries. As we use this dataset for pretraining we
concatenate reference and summary
"""
import datetime

from datasets import Dataset, DatasetDict, load_dataset

oldest_case = "1855-02-28"
date_added = datetime.datetime.now().strftime("%Y-%m-%d")
case_time_span = ", ".join([oldest_case, date_added])
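
# NOTE: the raw dataset exposes no per-case decision date among its columns, so
# every row gets the same best-guess "created" span from the oldest case to the
# download date, e.g. "1855-02-28, 2024-05-30".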


def reformat_dataset(ds: Dataset) -> Dataset:
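    """Reformat raw domsdatabasen columns into the id/text/source/added/created/metadata layout from the module docstring."""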
    # current keys:
    # ['case_id', 'Overskrift', 'Afgørelsesstatus', 'Faggruppe', 'Ret', 'Rettens sagsnummer', 'Sagstype', 'Instans', 'Domsdatabasens sagsnummer', 'Sagsemner', 'Særlige retsskridt', 'Sagsdeltagere', 'Dørlukning', 'Løftet ud af småsagsprocessen', 'Anerkendelsespåstand', 'Politiets journalnummer', 'Påstandsbeløb', 'Sagskomplekser', 'text', 'text_anonymized', 'text_len', 'text_anon_len']

    # rename case_id to id
    ds = ds.rename_column("case_id", "id")

    # add source column
    source_column = ["domsdatabasen"] * len(ds)  # type: ignore
    ds = ds.add_column("source", source_column)  # type: ignore

    # add created column
    created_column = [case_time_span] * len(ds)  # type: ignore
    ds = ds.add_column("created", created_column)  # type: ignore

    # add added column
    added_column = [date_added] * len(ds)  # type: ignore
    ds = ds.add_column("added", added_column)  # type: ignore

    metadata_keys = ['Overskrift', 'Afgørelsesstatus', 'Faggruppe', 'Ret', 'Rettens sagsnummer', 'Sagstype', 'Instans', 'Domsdatabasens sagsnummer', 'Sagsemner', 'Særlige retsskridt', 'Sagsdeltagere', 'Dørlukning', 'Løftet ud af småsagsprocessen', 'Anerkendelsespåstand', 'Politiets journalnummer', 'Påstandsbeløb', 'Sagskomplekser', 'text_len']

    # add metadata
    ds = ds.map(  # type: ignore
        lambda x: {  # type: ignore
            "text": x["text"],
            "metadata": {k: x[k] for k in metadata_keys},  # type: ignore
        },
    )  # type: ignore
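    # the metadata columns now live in the nested "metadata" dict, so drop the
    # top-level copies along with the anonymized text variants we do not use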
    ds = ds.remove_columns(["text_anonymized", "text_anon_len"])  # type: ignore
    ds = ds.remove_columns(metadata_keys)  # type: ignore

    return ds  # type: ignore


def main():
    ds = load_dataset("alexandrainst/domsdatabasen")
    assert isinstance(ds, DatasetDict)
    # We take only the train split in case this dataset is later used for model evaluation
    ds = ds["train"]
    assert isinstance(ds, Dataset)

    # reformat
    ds = reformat_dataset(ds)

    # save to jsonl.gz
    ds.to_json("domsdatabasen.jsonl.gz", orient="records", lines=True, compression="gzip")  # type: ignore
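    # NOTE: datasets forwards these keyword arguments to pandas' DataFrame.to_json,
    # so orient="records" with lines=True emits one JSON object per line and
    # compression="gzip" gzips the output file.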


if __name__ == "__main__":
    main()

    # test that it loads back in
    ds = load_dataset("json", data_files="domsdatabasen.jsonl.gz", split="train")
    assert isinstance(ds[0], dict)  # type: ignore

    # test that it can be streamed
    ds = load_dataset(
        "json",
        data_files="domsdatabasen.jsonl.gz",
        split="train",
        streaming=True,
    )
    example = next(iter(ds))  # type: ignore
    assert isinstance(example, dict)
