Update collection style format in multiple dataset JSON files

ARBML · Dec 26, 2024 · fb31995 · fb31995
1 parent 8e2ac63
commit fb31995
Show file tree

Hide file tree

Showing 208 changed files with 208 additions and 208 deletions.
diff --git a/README.md b/README.md
@@ -68,7 +68,7 @@ which gives the following output
  'Affiliations': ',The Islamic University of Gaza,,',
  'Authors': 'Chatrine Qwaider,Motaz Saad,S. Chatzikyriakidis,Simon Dobnik',
  'Citations': '25.0',
- 'Collection Style': 'crawling and annotation(other)',
+ 'Collection Style': 'crawling,annotation',
  'Cost': '',
  'Derived From': '',
  'Description': 'the first Levantine Dialect Corpus (SDC) covering data from the four dialects spoken in Palestine, Jordan, Lebanon and Syria.',

diff --git a/datasets/absa-hotels.json b/datasets/absa-hotels.json
@@ -9,7 +9,7 @@
     "Dialect": "mixed",
     "Domain": "web pages",
     "Form": "text",
-    "Collection Style": "crawling and annotation(other)",
+    "Collection Style": "crawling,annotation",
     "Description": "Around 15,562 Hotels' reviews were thoroughly reviewed by this research authors and a subset of 2,291 reviews were selected. The original dataset has been collected from well known Hotels' booking websites such as Booking.com, TripAdvisor.com.",
     "Volume": "24,028",
     "Unit": "sentences",

diff --git a/datasets/adi-17.json b/datasets/adi-17.json
@@ -112,7 +112,7 @@
     "Dialect": "mixed",
     "Domain": "transcribed audio",
     "Form": "spoken",
-    "Collection Style": "crawling and annotation(other)",
+    "Collection Style": "crawling,annotation",
     "Description": "dialect identification of speech from YouTube to one of the 17 dialects",
     "Volume": "3,091",
     "Unit": "hours",

diff --git a/datasets/adi-5.json b/datasets/adi-5.json
@@ -40,7 +40,7 @@
     "Dialect": "mixed",
     "Domain": "transcribed audio",
     "Form": "spoken",
-    "Collection Style": "crawling and annotation(other)",
+    "Collection Style": "crawling,annotation",
     "Description": "This will be divided across the five major Arabic dialects; Egyptian (EGY), Levantine (LAV), Gulf (GLF), North African (NOR), and Modern Standard Arabic (MSA)",
     "Volume": "50",
     "Unit": "hours",

diff --git a/datasets/adpbc.json b/datasets/adpbc.json
@@ -9,7 +9,7 @@
     "Dialect": "Modern Standard Arabic",
     "Domain": "other",
     "Form": "text",
-    "Collection Style": "crawling and annotation(other)",
+    "Collection Style": "crawling,annotation",
     "Description": "This corpus contains the words and their dependency relation produced by performing some steps",
     "Volume": "16",
     "Unit": "documents",

diff --git a/datasets/adult_content_detection_on_arabic_twitter__analysis_and_experiments.json b/datasets/adult_content_detection_on_arabic_twitter__analysis_and_experiments.json
@@ -9,7 +9,7 @@
     "Dialect": "mixed",
     "Domain": "social media",
     "Form": "text",
-    "Collection Style": "crawling and annotation(other)",
+    "Collection Style": "crawling,annotation",
     "Description": "Adult Content Detection on Arabic Twitter",
     "Volume": "50,000",
     "Unit": "sentences",

diff --git a/datasets/ajgt.json b/datasets/ajgt.json
@@ -9,7 +9,7 @@
     "Dialect": "Jordan",
     "Domain": "social media",
     "Form": "text",
-    "Collection Style": "crawling and annotation(other)",
+    "Collection Style": "crawling,annotation",
     "Description": "Corpus consisted of 1,800 tweets annotated as positive and negative. Modern Standard Arabic (MSA) or Jordanian dialect.",
     "Volume": "1,800",
     "Unit": "sentences",

diff --git a/datasets/akec.json b/datasets/akec.json
@@ -9,7 +9,7 @@
     "Dialect": "Modern Standard Arabic",
     "Domain": "news articles",
     "Form": "text",
-    "Collection Style": "crawling and annotation(other)",
+    "Collection Style": "crawling,annotation",
     "Description": "The corpus consists of 160 Arabic documents and their keyphrases.",
     "Volume": "160",
     "Unit": "documents",

diff --git a/datasets/alr__arabic_laptop_reviews_dataset.json b/datasets/alr__arabic_laptop_reviews_dataset.json
@@ -9,7 +9,7 @@
     "Dialect": "mixed",
     "Domain": "reviews",
     "Form": "text",
-    "Collection Style": "crawling and annotation(other)",
+    "Collection Style": "crawling,annotation",
     "Description": "Arabic Laptops Reviews (ALR) dataset focuses on laptops reviews written in Arabic",
     "Volume": "1,753",
     "Unit": "sentences",

diff --git a/datasets/amara.json b/datasets/amara.json
@@ -9,7 +9,7 @@
     "Dialect": "Modern Standard Arabic",
     "Domain": "other",
     "Form": "text",
-    "Collection Style": "crawling and annotation(translation)",
+    "Collection Style": "crawling,annotation,machine translation",
     "Description": "multilingually aligned for 20 languages, i.e. 20 monolingual corpora and 190 parallel corpora",
     "Volume": "154,301",
     "Unit": "sentences",

diff --git a/datasets/anercorp.json b/datasets/anercorp.json
@@ -9,7 +9,7 @@
     "Dialect": "Modern Standard Arabic",
     "Domain": "news articles",
     "Form": "text",
-    "Collection Style": "crawling and annotation(other)",
+    "Collection Style": "crawling,annotation",
     "Description": "collected from different resources ",
     "Volume": "316",
     "Unit": "documents",

diff --git a/datasets/anetac.json b/datasets/anetac.json
@@ -9,7 +9,7 @@
     "Dialect": "Modern Standard Arabic",
     "Domain": "other",
     "Form": "text",
-    "Collection Style": "crawling and annotation(other)",
+    "Collection Style": "crawling,annotation",
     "Description": "English-Arabic named entity transliteration and classification dataset",
     "Volume": "79,924",
     "Unit": "sentences",

diff --git a/datasets/annotated_shami_corpus.json b/datasets/annotated_shami_corpus.json
@@ -9,7 +9,7 @@
     "Dialect": "Lebanon",
     "Domain": "social media",
     "Form": "text",
-    "Collection Style": "crawling and annotation(other)",
+    "Collection Style": "crawling,annotation",
     "Description": "Subsection of the Lebanese portion of the Shami Corpus annotated for spelling standardization (CODA), morphological segmentation and tagging, and spontaneous orthography taxonomy tagging.",
     "Volume": "10,000",
     "Unit": "tokens",

diff --git a/datasets/annotated_tweet_corpus_in_arabizi,_french_and_english.json b/datasets/annotated_tweet_corpus_in_arabizi,_french_and_english.json
@@ -9,7 +9,7 @@
     "Dialect": "mixed",
     "Domain": "social media",
     "Form": "text",
-    "Collection Style": "crawling and annotation(other)",
+    "Collection Style": "crawling,annotation",
     "Description": "In total, 17,103 sequences were annotated from 585,163 tweets (196,374 in English, 254,748 in French and 134,041 in Arabizi), including the themes \u201cOthers\u201d and \u201cIncomprehensible\u201d. Among these sequences, 4,578 sequences having at least 20 tweets annotated with the 3 predefined themes (Hooliganism, Racism and Terrorism) were obtained, including 1,866 sequences with an opinion change. They are distributed as follows: 2,141 sequences in English (57,655 tweets), 1,942 sequences in French (48,854 tweets) and 495 sequences in Arabizi (21,216 tweets). A sub-corpus of 8,733 tweets (1,209 in English, 3,938 in French and 3,585 in Arabizi) annotated as \u201chateful\u201d, according to topic/opinion annotations and by selecting tweets that contained insults, is also provided. ",
     "Volume": "134,041",
     "Unit": "sentences",

diff --git a/datasets/ans_corpus___claim_verification.json b/datasets/ans_corpus___claim_verification.json
@@ -16,7 +16,7 @@
     "Dialect": "Modern Standard Arabic",
     "Domain": "news articles",
     "Form": "text",
-    "Collection Style": "crawling and annotation(other)",
+    "Collection Style": "crawling,annotation",
     "Description": "corpus comes in two perspectives: a version consisting of 4,547 true and false claims and a version consisting of 3,786 pairs (claim, evidence).",
     "Volume": "4,547",
     "Unit": "sentences",

diff --git a/datasets/anti-social_behaviour_in_online_communication.json b/datasets/anti-social_behaviour_in_online_communication.json
@@ -9,7 +9,7 @@
     "Dialect": "mixed",
     "Domain": "social media",
     "Form": "text",
-    "Collection Style": "crawling and annotation(other)",
+    "Collection Style": "crawling,annotation",
     "Description": "a corpus of 15,050 labelled YouTube comments in Arabic",
     "Volume": "15,050",
     "Unit": "sentences",

diff --git a/datasets/aoc-aldi.json b/datasets/aoc-aldi.json
@@ -9,7 +9,7 @@
     "Dialect": "mixed",
     "Domain": "commentary",
     "Form": "text",
-    "Collection Style": "crawling and annotation(other)",
+    "Collection Style": "crawling,annotation",
     "Description": "Comments to news articles with a continuous level of dialectness score between 0 and 1.",
     "Volume": "127,835",
     "Unit": "sentences",

diff --git a/datasets/aoc.json b/datasets/aoc.json
@@ -22,7 +22,7 @@
     "Dialect": "mixed",
     "Domain": "news articles",
     "Form": "text",
-    "Collection Style": "crawling and annotation(other)",
+    "Collection Style": "crawling,annotation",
     "Description": "a 52M-word monolingual dataset rich in dialectal content",
     "Volume": "108,000",
     "Unit": "sentences",

diff --git a/datasets/apgc_v1_0__arabic_parallel_gender_corpus_v1_0.json b/datasets/apgc_v1_0__arabic_parallel_gender_corpus_v1_0.json
@@ -9,7 +9,7 @@
     "Dialect": "Modern Standard Arabic",
     "Domain": "other",
     "Form": "text",
-    "Collection Style": "crawling and annotation(other)",
+    "Collection Style": "crawling,annotation",
     "Description": "a corpus designed to support research on gender bias in natural language processing applications working on Arabic",
     "Volume": "12,000",
     "Unit": "sentences",

diff --git a/datasets/aqmar.json b/datasets/aqmar.json
@@ -9,7 +9,7 @@
     "Dialect": "Modern Standard Arabic",
     "Domain": "wikipedia",
     "Form": "text",
-    "Collection Style": "crawling and annotation(other)",
+    "Collection Style": "crawling,annotation",
     "Description": "This is a 74,000-token corpus of 28 Arabic Wikipedia articles hand-annotated for named entities.",
     "Volume": "74,000",
     "Unit": "tokens",

diff --git a/datasets/ar-embiddings__arabic_word_embeddings_for_sentiment_analysis.json b/datasets/ar-embiddings__arabic_word_embeddings_for_sentiment_analysis.json
@@ -9,7 +9,7 @@
     "Dialect": "mixed",
     "Domain": "social media",
     "Form": "text",
-    "Collection Style": "crawling and annotation(other)",
+    "Collection Style": "crawling,annotation",
     "Description": "A large corpus for generating Arabic word embeddings from multiple sources such as news articles, consumer reviews, Quran text, and tweets. The embeddings are used to perform sentiment analysis in both Standard and Dialectal Arabic without relying on hand-crafted features. The embeddings are applied to several binary classifiers to detect subjectivity and sentiment in Arabic texts.",
     "Volume": "190,000,000",
     "Unit": "tokens",

diff --git a/datasets/arab-acquis.json b/datasets/arab-acquis.json
@@ -9,7 +9,7 @@
     "Dialect": "Modern Standard Arabic",
     "Domain": "other",
     "Form": "text",
-    "Collection Style": "crawling and annotation(translation)",
+    "Collection Style": "crawling,annotation,machine translation",
     "Description": "consists of over 12,000 sentences from the JRCAcquis (Acquis Communautaire) corpus ",
     "Volume": "12,000",
     "Unit": "sentences",

diff --git a/datasets/arab-esl.json b/datasets/arab-esl.json
@@ -9,7 +9,7 @@
     "Dialect": "mixed",
     "Domain": "social media",
     "Form": "text",
-    "Collection Style": "crawling and annotation(other)",
+    "Collection Style": "crawling,annotation",
     "Description": "Emoji (the popular digital pictograms) are sometimes seen as a new kind of artificial and universally usable and consistent writing code. In spite of their assumed universality, there is some evidence that the sense of an emoji, specifically in regard to sentiment, may change from language to language and culture to culture. This paper investigates whether contextual emoji sentiment analysis is consistent across Arabic and European languages. To conduct this investigation, we, first, created the Arabic emoji sentiment lexicon (Arab-ESL). Then, we exploited an existing European emoji sentiment lexicon to compare the sentiment conveyed in each of the two families of language and culture (Arabic and European). The results show that the pairwise correlation between the two lexicons is consistent for emoji that represent, for instance, hearts, facial expressions, and body language. However, for a subset of emoji (those that represent objects, nature, symbols, and some human activities), there are large differences in the sentiment conveyed. More interestingly, an extremely high level of inconsistency has been shown with food emoji.",
     "Volume": "1,034",
     "Unit": "tokens",

diff --git a/datasets/arabic-dialect_english_parallel_text.json b/datasets/arabic-dialect_english_parallel_text.json
@@ -16,7 +16,7 @@
     "Dialect": "Levant",
     "Domain": "other",
     "Form": "text",
-    "Collection Style": "crawling and annotation(translation)",
+    "Collection Style": "crawling,annotation,machine translation",
     "Description": "it uses crowdsourcing to cheaply and quickly build Levantine-English and Egyptian-English parallel corpora, consisting of 1.1M words and 380k words, respectively.",
     "Volume": "1,500,000",
     "Unit": "tokens",

diff --git a/datasets/arabic-english_named_entities_dataset.json b/datasets/arabic-english_named_entities_dataset.json
@@ -9,7 +9,7 @@
     "Dialect": "mixed",
     "Domain": "news articles",
     "Form": "text",
-    "Collection Style": "crawling and annotation(translation)",
+    "Collection Style": "crawling,annotation,machine translation",
     "Description": "Arabic-English named entities dataset is created using DBpedia Linked datasets and parallel corpus. For annotating NE in monolingual English corpus we used Gate tool. Our approach is based on linked data entities by mapping them to Gate Gazetteers, and then constructing a type-oriented NE base covering person, Location and organization classes. The second task consists of the use of machine translation to translate these entities and then finally, generating our NE lexicon that encloses the list of Arabic entities that match to the English lists.",
     "Volume": "48,753",
     "Unit": "tokens",

diff --git a/datasets/arabic_dialects_dataset.json b/datasets/arabic_dialects_dataset.json
@@ -40,7 +40,7 @@
     "Dialect": "mixed",
     "Domain": "other",
     "Form": "text",
-    "Collection Style": "crawling and annotation(other)",
+    "Collection Style": "crawling,annotation",
     "Description": "Dataset of Arabic dialects for GULF, EGYPT, LEVANT, TUNISIAN Arabic dialects in addition to MSA.",
     "Volume": "16,494",
     "Unit": "sentences",

diff --git a/datasets/arabic_flood_twitter_dataset.json b/datasets/arabic_flood_twitter_dataset.json
@@ -9,7 +9,7 @@
     "Dialect": "mixed",
     "Domain": "social media",
     "Form": "text",
-    "Collection Style": "crawling and annotation(other)",
+    "Collection Style": "crawling,annotation",
     "Description": "It includes 4,037 human-labelled Arabic Twitter messages for four high-risk flood events that occurred in 2018",
     "Volume": "4,037",
     "Unit": "sentences",

diff --git a/datasets/arabic_hate_speech_2022_shared_task.json b/datasets/arabic_hate_speech_2022_shared_task.json
@@ -9,7 +9,7 @@
     "Dialect": "mixed",
     "Domain": "social media",
     "Form": "text",
-    "Collection Style": "crawling and annotation(other)",
+    "Collection Style": "crawling,annotation",
     "Description": "largest Arabic dataset for offensive, fine-grained hate speech, vulgar and violence content",
     "Volume": "12,698",
     "Unit": "sentences",

diff --git a/datasets/arabic_keyphrase_dataset.json b/datasets/arabic_keyphrase_dataset.json
@@ -9,7 +9,7 @@
     "Dialect": "Modern Standard Arabic",
     "Domain": "news articles",
     "Form": "text",
-    "Collection Style": "crawling and annotation(other)",
+    "Collection Style": "crawling,annotation",
     "Description": "A dataset in Arabic language for automatic keyphrase extraction algorithms",
     "Volume": "400",
     "Unit": "documents",

diff --git a/datasets/arabic_named_entities.json b/datasets/arabic_named_entities.json
@@ -9,7 +9,7 @@
     "Dialect": "Modern Standard Arabic",
     "Domain": "other",
     "Form": "text",
-    "Collection Style": "crawling and annotation(other)",
+    "Collection Style": "crawling,annotation",
     "Description": "we have extracted approximately 45,000 Arabic NE",
     "Volume": "45,000",
     "Unit": "tokens",

diff --git a/datasets/arabic_named_entity_gazetteer.json b/datasets/arabic_named_entity_gazetteer.json
@@ -9,7 +9,7 @@
     "Dialect": "Modern Standard Arabic",
     "Domain": "wikipedia",
     "Form": "text",
-    "Collection Style": "crawling and annotation(other)",
+    "Collection Style": "crawling,annotation",
     "Description": "A gazetteer of entities curated from Wikipedia.",
     "Volume": "68,355",
     "Unit": "tokens",

diff --git a/datasets/arabic_news_dataset_about_hajj.json b/datasets/arabic_news_dataset_about_hajj.json
@@ -9,7 +9,7 @@
     "Dialect": "Classical Arabic",
     "Domain": "news articles",
     "Form": "text",
-    "Collection Style": "crawling and annotation(other)",
+    "Collection Style": "crawling,annotation",
     "Description": "more than 2k articles about Hajj ",
     "Volume": "2,000",
     "Unit": "documents",

diff --git a/datasets/arabic_news_tweets.json b/datasets/arabic_news_tweets.json
@@ -9,7 +9,7 @@
     "Dialect": "Modern Standard Arabic",
     "Domain": "social media",
     "Form": "text",
-    "Collection Style": "crawling and annotation(other)",
+    "Collection Style": "crawling,annotation",
     "Description": "This dataset is a relatively great size collection of Arabic news tweets that were collected from an official and verified users in Twitter. All news that is collected from the most popular and official users in Saudi Arabia belongs to Saudi Arabia news. All data that is gathered was retrieved using specific time period and collected all news in that time. To the best of our knowledge, this dataset is the first Arabic news data collection that does not specify by keywords and belongs to Saudi Arabia. This news dataset can be valuable for diverse tasks in NLP, such as text classification and automated verification system. The dataset has been categorized into 5 different news classes which are general news, regions news, sport news, economic news, and quality life news. In this data article, 89,179 original tweets have presented and fully labeled into related categories.",
     "Volume": "89,179",
     "Unit": "sentences",

diff --git a/datasets/arabic_osact4___offensive_language_detection.json b/datasets/arabic_osact4___offensive_language_detection.json
@@ -9,7 +9,7 @@
     "Dialect": "mixed",
     "Domain": "social media",
     "Form": "text",
-    "Collection Style": "crawling and annotation(other)",
+    "Collection Style": "crawling,annotation",
     "Description": "OSACT4 Shared Task on Offensive Language Detection",
     "Volume": "8,000",
     "Unit": "sentences",

diff --git a/datasets/arabic_osact5___arabic_hate_speech.json b/datasets/arabic_osact5___arabic_hate_speech.json
@@ -9,7 +9,7 @@
     "Dialect": "mixed",
     "Domain": "social media",
     "Form": "text",
-    "Collection Style": "crawling and annotation(other)",
+    "Collection Style": "crawling,annotation",
     "Description": "Fine-Grained Hate Speech Detection on Arabic Twitter",
     "Volume": "10,157",
     "Unit": "sentences",

diff --git a/datasets/arabic_pos_dialect.json b/datasets/arabic_pos_dialect.json
@@ -34,7 +34,7 @@
     "Dialect": "mixed",
     "Domain": "social media",
     "Form": "text",
-    "Collection Style": "crawling and annotation(other)",
+    "Collection Style": "crawling,annotation",
     "Description": "includes tweets in Egyptian, Levantine, Gulf, and Maghrebi, with 350 tweets for each dialect with appropriate train/test/development splits for 5-fold cross validation",
     "Volume": "1,400",
     "Unit": "sentences",

diff --git a/datasets/arabic_punctuation_dataset.json b/datasets/arabic_punctuation_dataset.json
@@ -9,7 +9,7 @@
     "Dialect": "Modern Standard Arabic",
     "Domain": "books",
     "Form": "text",
-    "Collection Style": "crawling and annotation(translation)",
+    "Collection Style": "crawling,annotation,machine translation",
     "Description": "This is a curated dataset, specifically designed to facilitate the study of punctuation. It has undergone rigorous manual annotation and verification on the basis of sentence structure, with sentence boundaries clearly marked. ",
     "Volume": "12,183,000",
     "Unit": "sentences",

diff --git a/datasets/arabic_rc_datasets.json b/datasets/arabic_rc_datasets.json
@@ -9,7 +9,7 @@
     "Dialect": "Modern Standard Arabic",
     "Domain": "other",
     "Form": "text",
-    "Collection Style": "crawling and annotation(other)",
+    "Collection Style": "crawling,annotation",
     "Description": "Arabic Reading Comprehension Benchmarks Created Semiautomatically",
     "Volume": "2,862",
     "Unit": "sentences",

diff --git a/datasets/arabic_satire_dataset.json b/datasets/arabic_satire_dataset.json
@@ -9,7 +9,7 @@
     "Dialect": "Classical Arabic",
     "Domain": "other",
     "Form": "text",
-    "Collection Style": "crawling and annotation(other)",
+    "Collection Style": "crawling,annotation",
     "Description": "500 Arabic news and 500 Arabic satire articles ",
     "Volume": "1,000",
     "Unit": "sentences",

diff --git a/datasets/arabic_sentiment_lexicons.json b/datasets/arabic_sentiment_lexicons.json
@@ -9,7 +9,7 @@
     "Dialect": "mixed",
     "Domain": "social media",
     "Form": "text",
-    "Collection Style": "crawling and annotation(translation)",
+    "Collection Style": "crawling,annotation,machine translation",
     "Description": " by using distant supervision techniques on Arabic tweets, and by translating English sentiment lexicons into Arabic using a freely available statistical machine translation system",
     "Volume": "176,364",
     "Unit": "tokens",

diff --git a/datasets/arabic_sentiment_twitter_corpus.json b/datasets/arabic_sentiment_twitter_corpus.json
@@ -9,7 +9,7 @@
     "Dialect": "mixed",
     "Domain": "social media",
     "Form": "text",
-    "Collection Style": "crawling and annotation(other)",
+    "Collection Style": "crawling,annotation",
     "Description": "A Sentiment Analysis dataset. No extra information is provided regarding the dialects nor the collection methodology",
     "Volume": "58,000",
     "Unit": "sentences",

diff --git a/datasets/arabic_spam_and_ham_tweets.json b/datasets/arabic_spam_and_ham_tweets.json
@@ -9,7 +9,7 @@
     "Dialect": "mixed",
     "Domain": "social media",
     "Form": "text",
-    "Collection Style": "crawling and annotation(other)",
+    "Collection Style": "crawling,annotation",
     "Description": "The dataset contains 13241 records. Each record represents a tweet. The tweets are labeled either Ham or Spam. Ham means non-spam tweet. There are 1924 Spam tweets and 11299 Ham tweets. The tweets are unique i.e. there are no repeated tweets records.",
     "Volume": "13,241",
     "Unit": "sentences",

diff --git a/datasets/arabic_tweets_about_infectious_diseases.json b/datasets/arabic_tweets_about_infectious_diseases.json
@@ -9,7 +9,7 @@
     "Dialect": "mixed",
     "Domain": "social media",
     "Form": "text",
-    "Collection Style": "crawling and annotation(other)",
+    "Collection Style": "crawling,annotation",
     "Description": "This file contains a dataset of 1266 tweets classified by two Arabic native speakers into five types of sources: academic, media, government, health professional, and public.",
     "Volume": "1,266",
     "Unit": "sentences",

diff --git a/datasets/arabic_wikireading_and_kaiflematha.json b/datasets/arabic_wikireading_and_kaiflematha.json
@@ -9,7 +9,7 @@
     "Dialect": "Modern Standard Arabic",
     "Domain": "wikipedia",
     "Form": "text",
-    "Collection Style": "crawling and annotation(other)",
+    "Collection Style": "crawling,annotation",
     "Description": "high quality and large-scale Arabic reading comprehension datasets: Arabic WikiReading and KaifLematha with around +100 K instances.",
     "Volume": "100,000",
     "Unit": "documents",