Skip to content

Commit

Permalink
Merge pull request #45 from ljvmiranda921/add/ijelid
Browse files Browse the repository at this point in the history
Closes #29 | Add dataset loader for IJELID
  • Loading branch information
SamuelCahyawijaya authored Nov 20, 2023
2 parents aa35a9a + b472a1c commit 68e34f7
Show file tree
Hide file tree
Showing 3 changed files with 144 additions and 0 deletions.
Empty file.
142 changes: 142 additions & 0 deletions seacrowd/sea_datasets/ijelid/ijelid.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
from pathlib import Path
from typing import Dict, List, Tuple

import datasets
from datasets.download.download_manager import DownloadManager

from seacrowd.utils import schemas
from seacrowd.utils.configs import SEACrowdConfig
from seacrowd.utils.constants import Licenses, Tasks

_CITATION = """
@article{hidayatullah2023corpus,
title={Corpus creation and language identification for code-mixed Indonesian-Javanese-English Tweets},
author={Hidayatullah, Ahmad Fathan and Apong, Rosyzie Anna and Lai, Daphne TC and Qazi, Atika},
journal={PeerJ Computer Science},
volume={9},
pages={e1312},
year={2023},
publisher={PeerJ Inc.}
}
"""

_LOCAL = False
_LANGUAGES = ["ind", "jav", "eng"]
_DATASETNAME = "ijelid"
_DESCRIPTION = """\
This is a code-mixed Indonesian-Javanese-English dataset for token-level
language identification. We named this dataset as IJELID
(Indonesian-Javanese-English Language Identification). This dataset contains
tweets that have been tokenized with the corresponding token and its language
label. There are seven language labels in the dataset, namely: ID (Indonesian)JV
(Javanese), EN (English), MIX_ID_EN (mixed Indonesian-English), MIX_ID_JV (mixed
Indonesian-Javanese), MIX_JV_EN (mixed Javanese-English), OTH (Other).
"""

# Upstream repository hosting the raw annotated tweets.
_HOMEPAGE = "https://github.com/fathanick/Code-mixed-Indonesian-Javanese-English-Twitter-Data"
# License as declared by the upstream repository (CC BY-NC-SA 4.0).
_LICENSE = Licenses.CC_BY_NC_SA_4_0.value
# Raw TSV file per split; note the upstream file "val.tsv" backs the "dev" split.
_URLS = {
    "train": "https://raw.githubusercontent.com/fathanick/Code-mixed-Indonesian-Javanese-English-Twitter-Data/main/train.tsv",
    "dev": "https://raw.githubusercontent.com/fathanick/Code-mixed-Indonesian-Javanese-English-Twitter-Data/main/val.tsv",
    "test": "https://raw.githubusercontent.com/fathanick/Code-mixed-Indonesian-Javanese-English-Twitter-Data/main/test.tsv",
}

# Token-level language ID maps to the SEQ_LABEL schema (see seacrowd/utils/constants.py).
_SUPPORTED_TASKS = [Tasks.TOKEN_LEVEL_LANGUAGE_IDENTIFICATION]
_SOURCE_VERSION = "1.0.0"
_SEACROWD_VERSION = "1.0.0"


class IJELIDDataset(datasets.GeneratorBasedBuilder):
    """Loader for IJELID: token-level language identification over code-mixed
    Indonesian-Javanese-English tweets.

    Source: https://github.com/fathanick/Code-mixed-Indonesian-Javanese-English-Twitter-Data
    """

    SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
    SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION)

    SEACROWD_SCHEMA_NAME = "seq_label"
    # The seven token-level language labels used by the corpus.
    LABEL_CLASSES = ["ID", "JV", "EN", "MIX_ID_EN", "MIX_ID_JV", "MIX_JV_EN", "OTH"]

    BUILDER_CONFIGS = [
        SEACrowdConfig(
            name=f"{_DATASETNAME}_source",
            version=SOURCE_VERSION,
            description=f"{_DATASETNAME} source schema",
            schema="source",
            subset_id=_DATASETNAME,
        ),
        SEACrowdConfig(
            name=f"{_DATASETNAME}_seacrowd_{SEACROWD_SCHEMA_NAME}",
            version=SEACROWD_VERSION,
            description=f"{_DATASETNAME} SEACrowd schema",
            schema=f"seacrowd_{SEACROWD_SCHEMA_NAME}",
            subset_id=_DATASETNAME,
        ),
    ]

    DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source"

    def _info(self) -> datasets.DatasetInfo:
        """Return dataset metadata.

        The source data carries no schema of its own, so both the source and
        the SEACrowd configs share the same sequence-labeling feature spec.
        """
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=schemas.seq_label_features(self.LABEL_CLASSES),
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager: DownloadManager) -> List[datasets.SplitGenerator]:
        """Download each split's TSV and wire it to a SplitGenerator."""
        split_names = {
            "train": datasets.Split.TRAIN,
            "dev": datasets.Split.VALIDATION,
            "test": datasets.Split.TEST,
        }
        return [
            datasets.SplitGenerator(
                name=split_name,
                gen_kwargs={
                    "filepath": Path(dl_manager.download_and_extract(_URLS[key])),
                    "split": key,
                },
            )
            for key, split_name in split_names.items()
        ]

    def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]:
        """Yield (key, example) pairs from a CoNLL-style TSV.

        Tokens appear one per line as "token<TAB>label"; a blank line ends a
        tweet. Each example carries parallel ``tokens`` and ``labels`` lists.
        """
        example_idx = 0
        token_buf: List[str] = []
        label_buf: List[str] = []
        with open(filepath, encoding="utf-8") as tsv_file:
            for raw_line in tsv_file:
                if raw_line in ("", "\n"):
                    # Blank line: flush the accumulated tweet, if any.
                    if token_buf:
                        yield example_idx, {
                            "id": str(example_idx),
                            "tokens": token_buf,
                            "labels": label_buf,
                        }
                        example_idx += 1
                        token_buf = []
                        label_buf = []
                    continue
                # Columns are tab-separated; strip the trailing newline off the label.
                token, label = raw_line.split("\t")
                token_buf.append(token)
                label_buf.append(label.rstrip())

            # Flush the final tweet when the file lacks a trailing blank line.
            if token_buf:
                yield example_idx, {
                    "id": str(example_idx),
                    "tokens": token_buf,
                    "labels": label_buf,
                }
2 changes: 2 additions & 0 deletions seacrowd/utils/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ class Tasks(Enum):
KEYWORD_TAGGING = "KT"
NAMED_ENTITY_RECOGNITION = "NER"
SENTENCE_ORDERING = "SO"
TOKEN_LEVEL_LANGUAGE_IDENTIFICATION = "LANGID"

# Pair Text Classification
QUESTION_ANSWERING = "QA"
Expand Down Expand Up @@ -189,6 +190,7 @@ class Licenses(Enum):
Tasks.POS_TAGGING: "SEQ_LABEL",
Tasks.KEYWORD_TAGGING: "SEQ_LABEL",
Tasks.SENTENCE_ORDERING: "SEQ_LABEL",
Tasks.TOKEN_LEVEL_LANGUAGE_IDENTIFICATION: "SEQ_LABEL",
Tasks.QUESTION_ANSWERING: "QA",
Tasks.TEXTUAL_ENTAILMENT: "PAIRS",
Tasks.SEMANTIC_SIMILARITY: "PAIRS_SCORE",
Expand Down

0 comments on commit 68e34f7

Please sign in to comment.