From 71a16031c67ef4ef6e178c7cfc80f127a0bfe2a2 Mon Sep 17 00:00:00 2001 From: akhdanfadh Date: Tue, 2 Apr 2024 19:44:08 +0900 Subject: [PATCH 1/3] add image classification schema --- seacrowd/utils/constants.py | 10 +++++++++ seacrowd/utils/schemas/__init__.py | 4 ++++ seacrowd/utils/schemas/image.py | 35 ++++++++++++++++++++++++++++++ 3 files changed, 49 insertions(+) create mode 100644 seacrowd/utils/schemas/image.py diff --git a/seacrowd/utils/constants.py b/seacrowd/utils/constants.py index eea6ee1ca..1110a0651 100644 --- a/seacrowd/utils/constants.py +++ b/seacrowd/utils/constants.py @@ -10,6 +10,8 @@ pairs_features_score, pairs_multi_features, qa_features, + image_features, + image_multi_features, imqa_features, seq_label_features, speech2speech_features, @@ -121,6 +123,10 @@ class Tasks(Enum): # SpeechSpeech SPEECH_TO_SPEECH_TRANSLATION = "S2ST" + # Image + IMAGE_CLASSIFICATION = "IMC" + IMAGE_CLASSIFICATION_MULTILABEL = "IMC_MULTI" + # ImageText IMAGE_CAPTIONING = "IC" VISUAL_QUESTION_ANSWERING = "VQA" @@ -281,6 +287,8 @@ class Licenses(Enum): Tasks.SPEECH_EMOTION_RECOGNITION: "SPEECH", Tasks.SPEECH_EMOTION_RECOGNITION_MULTILABEL: "SPEECH_MULTI", Tasks.VISUAL_QUESTION_ANSWERING: "IMQA", + Tasks.IMAGE_CLASSIFICATION: "IMAGE", + Tasks.IMAGE_CLASSIFICATION_MULTILABEL: "IMAGE_MULTI", Tasks.IMAGE_CAPTIONING: "IMTEXT", Tasks.SIGN_LANGUAGE_RECOGNITION: "IMTEXT", Tasks.OPTICAL_CHARACTER_RECOGNITION: "IMTEXT", @@ -317,6 +325,8 @@ class Licenses(Enum): "S2S": speech2speech_features, "SPEECH": speech_features(), "SPEECH_MULTI": speech_multi_features(), + "IMAGE": image_features(), + "IMAGE_MULTI": image_multi_features(), "IMTEXT": image_text_features(), "IMQA": imqa_features, "VIDTEXT": video_features, diff --git a/seacrowd/utils/schemas/__init__.py b/seacrowd/utils/schemas/__init__.py index f53a83890..ec4c035f8 100644 --- a/seacrowd/utils/schemas/__init__.py +++ b/seacrowd/utils/schemas/__init__.py @@ -5,6 +5,8 @@ from .pairs import features_with_continuous_label as 
pairs_features_score from .pairs_multilabel import features as pairs_multi_features from .qa import features as qa_features +from .image import features as image_features +from .image import multi_features as image_multi_features from .imqa import features as imqa_features from .self_supervised_pretraining import features as ssp_features from .seq_label import features as seq_label_features @@ -26,6 +28,8 @@ "pairs_features_score", "pairs_multi_features", "qa_features", + "image_features", + "image_multi_features", "imqa_features", "ssp_features", "seq_label_features", diff --git a/seacrowd/utils/schemas/image.py b/seacrowd/utils/schemas/image.py new file mode 100644 index 000000000..5b5ba0aa9 --- /dev/null +++ b/seacrowd/utils/schemas/image.py @@ -0,0 +1,35 @@ +""" +General Image Classification Schema + +The field "metadata" is not specified to allow some flexibility. +On how to use "metadata", choose one: +1. defining as empty dict if you don't think it's usable in + `_generate_examples`, or +2. 
defining metadata as a dict that maps each intended column name to its + `datasets` feature type in the `_info` Dataloader method, then populating it with the + values in the `_generate_examples` Dataloader method +""" + +import datasets + + +def features(label_names=["Yes", "No"]): + return datasets.Features( + { + "id": datasets.Value("string"), + "labels": datasets.ClassLabel(names=label_names), + "image_path": datasets.Value("string"), + "metadata": {}, + } + ) + + +def multi_features(label_names=["Yes", "No"]): + return datasets.Features( + { + "id": datasets.Value("string"), + "labels": datasets.Sequence(datasets.ClassLabel(names=label_names)), + "image_path": datasets.Value("string"), + "metadata": {}, + } + ) From fc1499e207925219326a6a89e43634ce01c49e7a Mon Sep 17 00:00:00 2001 From: akhdanfadh Date: Tue, 2 Apr 2024 19:44:26 +0900 Subject: [PATCH 2/3] add dataloader --- .../total_defense_meme/__init__.py | 0 .../total_defense_meme/total_defense_meme.py | 277 ++++++++++++++++++ 2 files changed, 277 insertions(+) create mode 100644 seacrowd/sea_datasets/total_defense_meme/__init__.py create mode 100644 seacrowd/sea_datasets/total_defense_meme/total_defense_meme.py diff --git a/seacrowd/sea_datasets/total_defense_meme/__init__.py b/seacrowd/sea_datasets/total_defense_meme/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/total_defense_meme/total_defense_meme.py b/seacrowd/sea_datasets/total_defense_meme/total_defense_meme.py new file mode 100644 index 000000000..1bd9834c7 --- /dev/null +++ b/seacrowd/sea_datasets/total_defense_meme/total_defense_meme.py @@ -0,0 +1,277 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets +import gdown + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import TASK_TO_SCHEMA, Licenses, Tasks + +_CITATION = """\ +@inproceedings{10.1145/3587819.3592545, + author = {Prakash, Nirmalendu and Hee, Ming Shan and Lee, Roy Ka-Wei}, + title = {TotalDefMeme: A Multi-Attribute Meme dataset on Total Defence in Singapore}, + year = {2023}, + isbn = {9798400701481}, + publisher = {Association for Computing Machinery}, + address = {New York, NY, USA}, + url = {https://doi.org/10.1145/3587819.3592545}, + doi = {10.1145/3587819.3592545}, + booktitle = {Proceedings of the 14th Conference on ACM Multimedia Systems}, + pages = {369–375}, + numpages = {7}, + keywords = {multimodal, meme, dataset, topic clustering, stance classification}, + location = {Vancouver, BC, Canada}, + series = {MMSys '23} +} +""" + +_DATASETNAME = "total_defense_meme" + +_DESCRIPTION = """\ +This is a large-scale multimodal and multi-attribute dataset containing memes +about Singapore's Total Defence policy from different social media platforms. +The type (Singaporean or generic), pillars (military, civil, economic, social, +psychological, digital, others), topics and stances (against, neutral, +supportive) of each meme are manually identified by annotators. 
+""" + +_HOMEPAGE = "https://gitlab.com/bottle_shop/meme/TotalDefMemes" + +_LANGUAGES = ["eng"] + +_LICENSE = Licenses.UNKNOWN.value + +_LOCAL = False + +_URLS = { + "image": "https://drive.google.com/file/d/1oJIh4QQS3Idff2g6bZORstS5uBROjUUz/view?usp=share_link", + "annotations": "https://gitlab.com/bottle_shop/meme/TotalDefMemes/-/raw/main/report/annotation.json?ref_type=heads", +} + +_SUPPORTED_TASKS = [Tasks.OPTICAL_CHARACTER_RECOGNITION, Tasks.IMAGE_CLASSIFICATION_MULTILABEL] +_SEACROWD_SCHEMA = { + task.value: f"seacrowd_{TASK_TO_SCHEMA[task].lower()}" for task in _SUPPORTED_TASKS +} # ocr: imtext, imc_multi: image_multi + +_SOURCE_VERSION = "1.0.0" + +_SEACROWD_VERSION = "1.0.0" + + +class TotalDefenseMemeDataset(datasets.GeneratorBasedBuilder): + """Multimodal dataset containing memes about Singapore's Total Defence policy""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=_DATASETNAME, + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_{_SEACROWD_SCHEMA['OCR']}", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME} SEACrowd schema", + schema=_SEACROWD_SCHEMA["OCR"], + subset_id=_DATASETNAME, + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_{_SEACROWD_SCHEMA['IMC_MULTI']}", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME} SEACrowd schema", + schema=_SEACROWD_SCHEMA["IMC_MULTI"], + subset_id=_DATASETNAME, + ), + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + + def _info(self) -> datasets.DatasetInfo: + # define labelling + meme_type = ["Non_Memes", "Non_SG_Memes", "SG_Memes"] + pillar_type = [ + "Social", + "Economic", + "Psychological", + "Military", + "Civil", + "Digital", + "Others", + ] + stance_type = ["Against", "Neutral", "Supportive"] + + if self.config.schema == "source": + features = 
datasets.Features( + { + "image_path": datasets.Value("string"), + "categories": datasets.Sequence(datasets.ClassLabel(names=meme_type)), + "text": datasets.Value("string"), + "tags": datasets.Sequence(datasets.Value("string")), + "pillar_stances": datasets.Sequence( + { + "category": datasets.ClassLabel(names=pillar_type), + "stance": datasets.Sequence(datasets.ClassLabel(names=stance_type)), + } + ), + } + ) + + elif self.config.schema == _SEACROWD_SCHEMA["OCR"]: # all images + features = schemas.image_text_features(label_names=meme_type) + features["metadata"] = { + "tags": datasets.Sequence(datasets.Value("string")), + "pillar_stances": datasets.Sequence( + { + "category": datasets.ClassLabel(names=pillar_type), + "stance": datasets.Sequence(datasets.ClassLabel(names=stance_type)), + } + ), + } + elif self.config.schema == _SEACROWD_SCHEMA["IMC_MULTI"]: # sg meme images only + features = schemas.image_multi_features(label_names=pillar_type) + features["metadata"] = { + "tags": datasets.Sequence(datasets.Value("string")), + "stances": datasets.Sequence(datasets.Sequence(datasets.ClassLabel(names=stance_type))), + } + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + # download image from gdrive + output_dir = Path.cwd() / "data" / _DATASETNAME + output_dir.mkdir(parents=True, exist_ok=True) + output_file = output_dir / f"{_DATASETNAME}.zip" + if not output_file.exists(): + gdown.download(_URLS["image"], str(output_file), fuzzy=True) + else: + print(f"File already downloaded: {str(output_file)}") + # extract image data + image_dir = Path(dl_manager.extract(output_file)) / "TD_Memes" + + # download annotations + annotation_path = Path(dl_manager.download(_URLS["annotations"])) + return [ + datasets.SplitGenerator( + 
name=datasets.Split.TRAIN, + gen_kwargs={ + "image_dir": image_dir, + "annotation_file": annotation_path, + }, + ), + ] + + def _generate_examples(self, image_dir: Path, annotation_file: Path) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + # load annotation + with open(annotation_file, "r", encoding="utf-8") as file: + annotation = json.load(file) + + # get unique image names + image_names = sorted( + list( + set(annotation["Non_Memes"]) + | set(annotation["Non_SG_Memes"]) + | set(annotation["SG_Memes"]) + ) + ) + + # annotation data is a list of dict, instead of dict of image names + def get_value(image_name, list_of_dicts): + for dictionary in list_of_dicts: + if image_name in dictionary: + return dictionary[image_name] + return None + + key = 0 + for image_name in image_names: + # assert image exist in directory + assert (image_dir / image_name).exists(), f"Image {image_name} not found" + image_path = str(image_dir / image_name) + + # get categories, can be multiple + categories = [] + if image_name in annotation["Non_Memes"]: + categories.append("Non_Memes") + if image_name in annotation["Non_SG_Memes"]: + categories.append("Non_SG_Memes") + if image_name in annotation["SG_Memes"]: + categories.append("SG_Memes") + + # get attributes + text = get_value(image_name, annotation["Text"]) + tags = get_value(image_name, annotation["Tags"]) + raw_pillar_stances = get_value(image_name, annotation["Pillar_Stances"]) + + # process pillar stances + pillar_stances = [] + if raw_pillar_stances: + for pillar, stances in raw_pillar_stances: + category = pillar.split(" ")[0] + pillar_stances.append({"category": category, "stance": stances}) + + # source schema + if self.config.schema == "source": + yield key, { + "image_path": image_path, + "categories": categories, + "text": text, + "tags": tags, + "pillar_stances": pillar_stances, + } + key += 1 + + # ocr seacrowd schema + elif self.config.schema == _SEACROWD_SCHEMA["OCR"]: + yield key, { + "id": 
str(key), + "image_paths": [image_path], + "texts": text, + "metadata": { + "tags": tags, + "pillar_stances": pillar_stances, + }, + } + key += 1 + + # pillar classification seacrowd schema + elif self.config.schema == _SEACROWD_SCHEMA["IMC_MULTI"]: + if pillar_stances: # only those with pillar stances + yield key, { + "id": str(key), + "labels": [pillar["category"] for pillar in pillar_stances], + "image_path": image_path, + "metadata": { + "tags": tags, + "stances": [pillar["stance"] for pillar in pillar_stances], + }, + } + key += 1 From f07c6babb621e3bd7b2619f0942a7b04884fd686 Mon Sep 17 00:00:00 2001 From: akhdanfadh Date: Thu, 18 Apr 2024 13:37:31 +0900 Subject: [PATCH 3/3] change source feature, modify comment --- .../sea_datasets/total_defense_meme/total_defense_meme.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/seacrowd/sea_datasets/total_defense_meme/total_defense_meme.py b/seacrowd/sea_datasets/total_defense_meme/total_defense_meme.py index 1bd9834c7..e9dbded2e 100644 --- a/seacrowd/sea_datasets/total_defense_meme/total_defense_meme.py +++ b/seacrowd/sea_datasets/total_defense_meme/total_defense_meme.py @@ -126,7 +126,7 @@ def _info(self) -> datasets.DatasetInfo: features = datasets.Features( { "image_path": datasets.Value("string"), - "categories": datasets.Sequence(datasets.ClassLabel(names=meme_type)), + "meme_type": datasets.Sequence(datasets.ClassLabel(names=meme_type)), "text": datasets.Value("string"), "tags": datasets.Sequence(datasets.Value("string")), "pillar_stances": datasets.Sequence( @@ -242,7 +242,7 @@ def get_value(image_name, list_of_dicts): if self.config.schema == "source": yield key, { "image_path": image_path, - "categories": categories, + "meme_type": categories, "text": text, "tags": tags, "pillar_stances": pillar_stances, @@ -262,7 +262,7 @@ def get_value(image_name, list_of_dicts): } key += 1 - # pillar classification seacrowd schema + # pillar/topic classification seacrowd schema elif 
self.config.schema == _SEACROWD_SCHEMA["IMC_MULTI"]: if pillar_stances: # only those with pillar stances yield key, {