From fb31995d6a6fdfc551b3669916bf470599532982 Mon Sep 17 00:00:00 2001 From: zaidalyafeai Date: Thu, 26 Dec 2024 12:18:52 +0300 Subject: [PATCH] Update collection style format in multiple dataset JSON files --- README.md | 2 +- datasets/absa-hotels.json | 2 +- datasets/adi-17.json | 2 +- datasets/adi-5.json | 2 +- datasets/adpbc.json | 2 +- ...t_detection_on_arabic_twitter__analysis_and_experiments.json | 2 +- datasets/ajgt.json | 2 +- datasets/akec.json | 2 +- datasets/alr__arabic_laptop_reviews_dataset.json | 2 +- datasets/amara.json | 2 +- datasets/anercorp.json | 2 +- datasets/anetac.json | 2 +- datasets/annotated_shami_corpus.json | 2 +- .../annotated_tweet_corpus_in_arabizi,_french_and_english.json | 2 +- datasets/ans_corpus___claim_verification.json | 2 +- datasets/anti-social_behaviour_in_online_communication.json | 2 +- datasets/aoc-aldi.json | 2 +- datasets/aoc.json | 2 +- datasets/apgc_v1_0__arabic_parallel_gender_corpus_v1_0.json | 2 +- datasets/aqmar.json | 2 +- ...biddings__arabic_word_embeddings_for_sentiment_analysis.json | 2 +- datasets/arab-acquis.json | 2 +- datasets/arab-esl.json | 2 +- datasets/arabic-dialect_english_parallel_text.json | 2 +- datasets/arabic-english_named_entities_dataset.json | 2 +- datasets/arabic_dialects_dataset.json | 2 +- datasets/arabic_flood_twitter_dataset.json | 2 +- datasets/arabic_hate_speech_2022_shared_task.json | 2 +- datasets/arabic_keyphrase_dataset.json | 2 +- datasets/arabic_named_entities.json | 2 +- datasets/arabic_named_entity_gazetteer.json | 2 +- datasets/arabic_news_dataset_about_hajj.json | 2 +- datasets/arabic_news_tweets.json | 2 +- datasets/arabic_osact4___offensive_language_detection.json | 2 +- datasets/arabic_osact5___arabic_hate_speech.json | 2 +- datasets/arabic_pos_dialect.json | 2 +- datasets/arabic_punctuation_dataset.json | 2 +- datasets/arabic_rc_datasets.json | 2 +- datasets/arabic_satire_dataset.json | 2 +- datasets/arabic_sentiment_lexicons.json | 2 +- datasets/arabic_sentiment_twitter_corpus.json | 2 +- datasets/arabic_spam_and_ham_tweets.json | 2 +- datasets/arabic_tweets_about_infectious_diseases.json | 2 +- datasets/arabic_wikireading_and_kaiflematha.json | 2 +- datasets/arabicaqa.json | 2 +- datasets/arabichatespeechdataset.json | 2 +- datasets/arabicmmlu.json | 2 +- datasets/arabicsa.json | 2 +- datasets/aracovid19-mfh.json | 2 +- ...arabic_covid-19_sentiment_and_sarcasm_detection_dataset.json | 2 +- datasets/aracust.json | 2 +- datasets/arafacts.json | 2 +- datasets/aranews.json | 2 +- datasets/arap-tweet_corpus.json | 2 +- datasets/arasencorpus.json | 2 +- datasets/arasenti.json | 2 +- datasets/arastance.json | 2 +- datasets/arc-wmi.json | 2 +- datasets/arcd.json | 2 +- ...ts_in_the_early_days_of_coronavirus_(covid-19)_pandemic.json | 2 +- datasets/arcov-19.json | 2 +- datasets/arcov19-rumors.json | 2 +- datasets/arcovidvac.json | 2 +- datasets/arentail.json | 2 +- datasets/armi__arabic_misogynistic_dataset.json | 2 +- datasets/arparallel.json | 2 +- datasets/arquad.json | 2 +- datasets/arsarcasm-v2.json | 2 +- datasets/arsarcasm.json | 2 +- datasets/arsas.json | 2 +- datasets/arsen-20.json | 2 +- datasets/arsentd-lev.json | 2 +- datasets/artest.json | 2 +- datasets/artrivia.json | 2 +- datasets/asad.json | 2 +- datasets/astad.json | 2 +- datasets/astd.json | 2 +- datasets/at-odtsa.json | 2 +- datasets/attimam.json | 2 +- datasets/author_attribution_tweets.json | 2 +- datasets/autotweet.json | 2 +- datasets/baec.json | 2 +- datasets/bbn_blog_posts_sentiment_corpus.json | 2 +- datasets/brad_1_0.json | 2 
+- datasets/calliar.json | 2 +- datasets/checkthat-ar.json | 2 +- datasets/cidar.json | 2 +- datasets/comparable_wikipedia_coprus.json | 2 +- ...orpora_for_egyptian_arabic_and_gulf_arabic_from_twitter.json | 2 +- datasets/corpus_of_offensive_language_in_arabic.json | 2 +- ...vid-19_disinfo__covid-19_disinformation_twitter_dataset.json | 2 +- datasets/covid-fakes.json | 2 +- datasets/covost_2.json | 2 +- datasets/cqa-md__semeval-2016_task_3.json | 2 +- datasets/daict.json | 2 +- datasets/dart.json | 2 +- ...qas__a_dataset_for_arabic_why_question_answering_system.json | 2 +- datasets/defarabicqa.json | 2 +- datasets/dziribert.json | 2 +- datasets/edgad.json | 2 +- datasets/emotional-tone.json | 2 +- datasets/evetar.json | 2 +- datasets/flodusta.json | 2 +- datasets/flores-101.json | 2 +- datasets/gem.json | 2 +- datasets/gem_-_wikilingua.json | 2 +- datasets/gem_-_xlsum.json | 2 +- datasets/gumar.json | 2 +- datasets/haad.json | 2 +- datasets/idat.json | 2 +- datasets/idrisi-r.json | 2 +- datasets/isarcasmeval__semeval-2022_task_6.json | 2 +- datasets/kalamdz.json | 2 +- datasets/kawarith.json | 2 +- datasets/ksaa-rd_dataset.json | 2 +- datasets/ksu_rich_arabic_speech_database.json | 2 +- datasets/l-hsab.json | 2 +- datasets/labr.json | 2 +- datasets/lama.json | 2 +- ...ge_multi-domain_resources_for_arabic_sentiment_analysis.json | 2 +- datasets/let-mi.json | 2 +- datasets/lince_-_msa-da__(lid_-_code_switching_).json | 2 +- datasets/lince_-_msa-egy_(ner_-_code_switching).json | 2 +- datasets/lisan.json | 2 +- ...a__multi-domain_arabic_resources_for_sentiment_analysis.json | 2 +- datasets/masc.json | 2 +- datasets/masc__massive_arabic_speech_corpus.json | 2 +- datasets/mawqif.json | 2 +- datasets/mcwc.json | 2 +- datasets/mediaspeech.json | 2 +- datasets/mega-cov.json | 2 +- datasets/mgb-2.json | 2 +- datasets/mgb-3.json | 2 +- datasets/mgb-5.json | 2 +- datasets/mlma_hate_speech.json | 2 +- datasets/mlqa.json | 2 +- datasets/mpold__multi_platforms_offensive_language_dataset.json | 2 +- datasets/msac.json | 2 +- datasets/msda.json | 2 +- "datasets/multilingual_hate\r\nspeech_detection_dataset.json" | 2 +- datasets/nadi-2020.json | 2 +- datasets/nadi-2021.json | 2 +- datasets/naim_mhedhbi_tunisian_dialect_corpus_v0.json | 2 +- datasets/named_entities_lexicon.json | 2 +- datasets/narabizi_corpus.json | 2 +- datasets/narabizi_treebank.json | 2 +- datasets/ne3l__named_entities_arabic_corpus.json | 2 +- datasets/nileulex.json | 2 +- datasets/nlp_dataset_for_arabic_dialects.json | 2 +- datasets/oca__opinion_corpus_for_arabic.json | 2 +- datasets/oclar.json | 2 +- datasets/offenseval_2020.json | 2 +- datasets/omcca.json | 2 +- datasets/ontonotes_5_0.json | 2 +- datasets/openiti-proc.json | 2 +- datasets/opus_ubuntu.json | 2 +- datasets/paad.json | 2 +- datasets/padic__parallel_arabic_dialect_corpus.json | 2 +- datasets/pan17_author_profiling.json | 2 +- datasets/pan18_author_profiling.json | 2 +- ...rabic_intrinsic_plagiarism_detection_shared_task_corpus.json | 2 +- datasets/polyglot-ner.json | 2 +- datasets/qa4mre.json | 2 +- datasets/qadi_arabic.json | 2 +- datasets/qasr.json | 2 +- datasets/qatari_heritage_corpus.json | 2 +- datasets/quran_hadith_datasets.json | 2 +- datasets/quran_speech__imam_+_users.json | 2 +- datasets/religious_hate_speech.json | 2 +- datasets/rsac.json | 2 +- datasets/sa`7r.json | 2 +- datasets/salma.json | 2 +- datasets/sanad.json | 2 +- datasets/semeval-2017_task_4.json | 2 +- datasets/semeval-2018_task_1.json | 2 +- datasets/senti_lex.json | 2 +- datasets/senwave.json | 
2 +- datasets/senzi.json | 2 +- datasets/shami.json | 2 +- datasets/sohateful.json | 2 +- datasets/spiral.json | 2 +- .../sudanese_dialect_tweets_about_ridesharing_companies.json | 2 +- ...danese_dialect_tweets_about_telecommunication_companies.json | 2 +- ...se_arabic_telcom_sentiment_classification_pre_processed.json | 2 +- datasets/syria_tweets_sentiment_corpus.json | 2 +- datasets/tarc.json | 2 +- datasets/tead.json | 2 +- datasets/ted_talks_corpus_(wit3).json | 2 +- datasets/the_nine_books_of_arabic_hadith.json | 2 +- datasets/toxi-text-3m.json | 2 +- ...english_mailing_lists_parallel_corpus_-_development_set.json | 2 +- ...arabic-english_mailing_lists_parallel_corpus_-_test_set.json | 2 +- ...d_arabic-english_newspaper_parallel_corpus_-_test_set_1.json | 2 +- datasets/troll_detection.json | 2 +- datasets/tsac.json | 2 +- datasets/tufs_media.json | 2 +- datasets/tunizi.json | 2 +- datasets/twifil.json | 2 +- datasets/twt15da_lists.json | 2 +- datasets/tydiqa.json | 2 +- ...standing_and_detecting_dangerous_speech_in_social_media.json | 2 +- datasets/wikiann.json | 2 +- datasets/wikiann_ner(mmner).json | 2 +- datasets/wojood.json | 2 +- datasets/wojoodfine.json | 2 +- datasets/wojoodgaza.json | 2 +- datasets/wojoodhadath.json | 2 +- datasets/wsd.json | 2 +- 208 files changed, 208 insertions(+), 208 deletions(-) diff --git a/README.md b/README.md index 617d084..4b36f5e 100644 --- a/README.md +++ b/README.md @@ -68,7 +68,7 @@ which gives the following output 'Affiliations': ',The Islamic University of Gaza,,', 'Authors': 'Chatrine Qwaider,Motaz Saad,S. Chatzikyriakidis,Simon Dobnik', 'Citations': '25.0', - 'Collection Style': 'crawling and annotation(other)', + 'Collection Style': 'crawling,annotation', 'Cost': '', 'Derived From': '', 'Description': 'the first Levantine Dialect Corpus (SDC) covering data from the four dialects spoken in Palestine, Jordan, Lebanon and Syria.', diff --git a/datasets/absa-hotels.json b/datasets/absa-hotels.json index beeb2d7..ffa2750 100644 --- a/datasets/absa-hotels.json +++ b/datasets/absa-hotels.json @@ -9,7 +9,7 @@ "Dialect": "mixed", "Domain": "web pages", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "Around 15,562 Hotels' reviews were thoroughly reviewed by this research authors and a subset of 2,291 reviews were selected. 
The original dataset has been collected from well known Hotels' booking websites such as Booking.com, TripAdvisor.com.", "Volume": "24,028", "Unit": "sentences", diff --git a/datasets/adi-17.json b/datasets/adi-17.json index d82f44f..387c8c8 100644 --- a/datasets/adi-17.json +++ b/datasets/adi-17.json @@ -112,7 +112,7 @@ "Dialect": "mixed", "Domain": "transcribed audio", "Form": "spoken", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "dialect identification of speech from YouTube to one of the 17 dialects", "Volume": "3,091", "Unit": "hours", diff --git a/datasets/adi-5.json b/datasets/adi-5.json index b410581..0e6f366 100644 --- a/datasets/adi-5.json +++ b/datasets/adi-5.json @@ -40,7 +40,7 @@ "Dialect": "mixed", "Domain": "transcribed audio", "Form": "spoken", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "This will be divided across the five major Arabic dialects; Egyptian (EGY), Levantine (LAV), Gulf (GLF), North African (NOR), and Modern Standard Arabic (MSA)", "Volume": "50", "Unit": "hours", diff --git a/datasets/adpbc.json b/datasets/adpbc.json index 21ca1e6..4beb0d5 100644 --- a/datasets/adpbc.json +++ b/datasets/adpbc.json @@ -9,7 +9,7 @@ "Dialect": "Modern Standard Arabic", "Domain": "other", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "This corpus contains the words and their dependency relation produced by performing some steps", "Volume": "16", "Unit": "documents", diff --git a/datasets/adult_content_detection_on_arabic_twitter__analysis_and_experiments.json b/datasets/adult_content_detection_on_arabic_twitter__analysis_and_experiments.json index c4acfd2..daa47e9 100644 --- a/datasets/adult_content_detection_on_arabic_twitter__analysis_and_experiments.json +++ b/datasets/adult_content_detection_on_arabic_twitter__analysis_and_experiments.json @@ -9,7 +9,7 @@ "Dialect": "mixed", "Domain": "social media", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "Adult Content Detection on Arabic Twitter", "Volume": "50,000", "Unit": "sentences", diff --git a/datasets/ajgt.json b/datasets/ajgt.json index 725bf7c..8495d8a 100644 --- a/datasets/ajgt.json +++ b/datasets/ajgt.json @@ -9,7 +9,7 @@ "Dialect": "Jordan", "Domain": "social media", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "Corpus consisted of 1,800 tweets annotated as positive and negative. 
Modern Standard Arabic (MSA) or Jordanian dialect.", "Volume": "1,800", "Unit": "sentences", diff --git a/datasets/akec.json b/datasets/akec.json index 42648bd..1463aec 100644 --- a/datasets/akec.json +++ b/datasets/akec.json @@ -9,7 +9,7 @@ "Dialect": "Modern Standard Arabic", "Domain": "news articles", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "The corpus consists in 160 arabic documents and their keyphrases.", "Volume": "160", "Unit": "documents", diff --git a/datasets/alr__arabic_laptop_reviews_dataset.json b/datasets/alr__arabic_laptop_reviews_dataset.json index b55de66..1099eef 100644 --- a/datasets/alr__arabic_laptop_reviews_dataset.json +++ b/datasets/alr__arabic_laptop_reviews_dataset.json @@ -9,7 +9,7 @@ "Dialect": "mixed", "Domain": "reviews", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "Arabic Laptops Reviews (ALR) dataset focuses on laptops reviews written in Arabic", "Volume": "1,753", "Unit": "sentences", diff --git a/datasets/amara.json b/datasets/amara.json index c9f49cf..5a5e5ba 100644 --- a/datasets/amara.json +++ b/datasets/amara.json @@ -9,7 +9,7 @@ "Dialect": "Modern Standard Arabic", "Domain": "other", "Form": "text", - "Collection Style": "crawling and annotation(translation)", + "Collection Style": "crawling,annotation,machine translation", "Description": "multilingually aligned for 20 languages, i.e. 20 monolingual corpora and 190 parallel corpora", "Volume": "154,301", "Unit": "sentences", diff --git a/datasets/anercorp.json b/datasets/anercorp.json index 7f4bcc0..fdc7a8a 100644 --- a/datasets/anercorp.json +++ b/datasets/anercorp.json @@ -9,7 +9,7 @@ "Dialect": "Modern Standard Arabic", "Domain": "news articles", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "collected from different resources ", "Volume": "316", "Unit": "documents", diff --git a/datasets/anetac.json b/datasets/anetac.json index 68f09db..a43de0f 100644 --- a/datasets/anetac.json +++ b/datasets/anetac.json @@ -9,7 +9,7 @@ "Dialect": "Modern Standard Arabic", "Domain": "other", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "English-Arabic named entity transliteration and classification dataset", "Volume": "79,924", "Unit": "sentences", diff --git a/datasets/annotated_shami_corpus.json b/datasets/annotated_shami_corpus.json index b443929..3be4d46 100644 --- a/datasets/annotated_shami_corpus.json +++ b/datasets/annotated_shami_corpus.json @@ -9,7 +9,7 @@ "Dialect": "Lebanon", "Domain": "social media", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "Subsection of the Lebanese portion of the Shami Corpus annotated for spelling standardization (CODA), morphological segmentation and tagging, and spontaneous orthography taxonomy tagging.", "Volume": "10,000", "Unit": "tokens", diff --git a/datasets/annotated_tweet_corpus_in_arabizi,_french_and_english.json b/datasets/annotated_tweet_corpus_in_arabizi,_french_and_english.json index 64ef8f8..f610e70 100644 --- a/datasets/annotated_tweet_corpus_in_arabizi,_french_and_english.json +++ b/datasets/annotated_tweet_corpus_in_arabizi,_french_and_english.json @@ -9,7 +9,7 @@ "Dialect": "mixed", "Domain": "social media", "Form": "text", - 
"Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "In total, 17,103 sequences were annotated from 585,163 tweets (196,374 in English, 254,748 in French and 134,041 in Arabizi), including the themes \u201cOthers\u201d and \u201cIncomprehensible\u201d. Among these sequences, 4,578 sequences having at least 20 tweets annotated with the 3 predefined themes (Hooliganism, Racism and Terrorism) were obtained, including 1,866 sequences with an opinion change. They are distributed as follows: 2,141 sequences in English (57,655 tweets), 1,942 sequences in French (48,854 tweets) and 495 sequences in Arabizi (21,216 tweets). A sub-corpus of 8,733 tweets (1,209 in English, 3,938 in French and 3,585 in Arabizi) annotated as \u201chateful\u201d, according to topic/opinion annotations and by selecting tweets that contained insults, is also provided. ", "Volume": "134,041", "Unit": "sentences", diff --git a/datasets/ans_corpus___claim_verification.json b/datasets/ans_corpus___claim_verification.json index bdb1066..9e2cf71 100644 --- a/datasets/ans_corpus___claim_verification.json +++ b/datasets/ans_corpus___claim_verification.json @@ -16,7 +16,7 @@ "Dialect": "Modern Standard Arabic", "Domain": "news articles", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "corpus comes in two perspectives: a version consisting of 4,547 true and false claims and a version consisting of 3,786 pairs (claim, evidence).", "Volume": "4,547", "Unit": "sentences", diff --git a/datasets/anti-social_behaviour_in_online_communication.json b/datasets/anti-social_behaviour_in_online_communication.json index 39e549e..f499187 100644 --- a/datasets/anti-social_behaviour_in_online_communication.json +++ b/datasets/anti-social_behaviour_in_online_communication.json @@ -9,7 +9,7 @@ "Dialect": "mixed", "Domain": "social media", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "a corpus of 15,050 labelled YouTube comments in Arabic", "Volume": "15,050", "Unit": "sentences", diff --git a/datasets/aoc-aldi.json b/datasets/aoc-aldi.json index 44e95e6..bc949d6 100644 --- a/datasets/aoc-aldi.json +++ b/datasets/aoc-aldi.json @@ -9,7 +9,7 @@ "Dialect": "mixed", "Domain": "commentary", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "Comments to news articles with a continuous level of dialectness score between 0 and 1.", "Volume": "127,835", "Unit": "sentences", diff --git a/datasets/aoc.json b/datasets/aoc.json index a68c067..c05a69f 100644 --- a/datasets/aoc.json +++ b/datasets/aoc.json @@ -22,7 +22,7 @@ "Dialect": "mixed", "Domain": "news articles", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "a 52M-word monolingual dataset rich in dialectal content", "Volume": "108,000", "Unit": "sentences", diff --git a/datasets/apgc_v1_0__arabic_parallel_gender_corpus_v1_0.json b/datasets/apgc_v1_0__arabic_parallel_gender_corpus_v1_0.json index e015df0..f3c1155 100644 --- a/datasets/apgc_v1_0__arabic_parallel_gender_corpus_v1_0.json +++ b/datasets/apgc_v1_0__arabic_parallel_gender_corpus_v1_0.json @@ -9,7 +9,7 @@ "Dialect": "Modern Standard Arabic", "Domain": "other", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": 
"crawling,annotation", "Description": "a corpus designed to support research on gender bias in natural language processing applications working on Arabic", "Volume": "12,000", "Unit": "sentences", diff --git a/datasets/aqmar.json b/datasets/aqmar.json index 203d77f..699ef55 100644 --- a/datasets/aqmar.json +++ b/datasets/aqmar.json @@ -9,7 +9,7 @@ "Dialect": "Modern Standard Arabic", "Domain": "wikipedia", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "This is a 74,000-token corpus of 28 Arabic Wikipedia articles hand-annotated for named entities.", "Volume": "74,000", "Unit": "tokens", diff --git a/datasets/ar-embiddings__arabic_word_embeddings_for_sentiment_analysis.json b/datasets/ar-embiddings__arabic_word_embeddings_for_sentiment_analysis.json index 6d68d36..35f5380 100644 --- a/datasets/ar-embiddings__arabic_word_embeddings_for_sentiment_analysis.json +++ b/datasets/ar-embiddings__arabic_word_embeddings_for_sentiment_analysis.json @@ -9,7 +9,7 @@ "Dialect": "mixed", "Domain": "social media", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "A large corpus for generating Arabic word embeddings from multiple sources such as news articles, consumer reviews, Quran text, and tweets. The embeddings are used to perform sentiment analysis in both Standard and Dialectal Arabic without relying on hand-crafted features. The embeddings are applied to several binary classifiers to detect subjectivity and sentiment in Arabic texts.", "Volume": "190,000,000", "Unit": "tokens", diff --git a/datasets/arab-acquis.json b/datasets/arab-acquis.json index 7ab7b2a..2f612cd 100644 --- a/datasets/arab-acquis.json +++ b/datasets/arab-acquis.json @@ -9,7 +9,7 @@ "Dialect": "Modern Standard Arabic", "Domain": "other", "Form": "text", - "Collection Style": "crawling and annotation(translation)", + "Collection Style": "crawling,annotation,machine translation", "Description": "consists of over 12,000 sentences from the JRCAcquis (Acquis Communautaire) corpus ", "Volume": "12,000", "Unit": "sentences", diff --git a/datasets/arab-esl.json b/datasets/arab-esl.json index 4208eb7..dfaf1e7 100644 --- a/datasets/arab-esl.json +++ b/datasets/arab-esl.json @@ -9,7 +9,7 @@ "Dialect": "mixed", "Domain": "social media", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "Emoji (the popular digital pictograms) are sometimes seen as a new kind of artificial and universally usable and consistent writing code. In spite of their assumed universality, there is some evidence that the sense of an emoji, specifically in regard to sentiment, may change from language to language and culture to culture. This paper investigates whether contextual emoji sentiment analysis is consistent across Arabic and European languages. To conduct this investigation, we, first, created the Arabic emoji sentiment lexicon (Arab-ESL). Then, we exploited an existing European emoji sentiment lexicon to compare the sentiment conveyed in each of the two families of language and culture (Arabic and European). The results show that the pairwise correlation between the two lexicons is consistent for emoji that represent, for instance, hearts, facial expressions, and body language. 
However, for a subset of emoji (those that represent objects, nature, symbols, and some human activities), there are large differences in the sentiment conveyed. More interestingly, an extremely high level of inconsistency has been shown with food emoji.", "Volume": "1,034", "Unit": "tokens", diff --git a/datasets/arabic-dialect_english_parallel_text.json b/datasets/arabic-dialect_english_parallel_text.json index 4e46029..cc66169 100644 --- a/datasets/arabic-dialect_english_parallel_text.json +++ b/datasets/arabic-dialect_english_parallel_text.json @@ -16,7 +16,7 @@ "Dialect": "Levant", "Domain": "other", "Form": "text", - "Collection Style": "crawling and annotation(translation)", + "Collection Style": "crawling,annotation,machine translation", "Description": "it uses crowdsourcing to cheaply and quickly build LevantineEnglish and Egyptian-English parallel corpora, consisting of 1.1M words and 380k words, respectively.", "Volume": "1,500,000", "Unit": "tokens", diff --git a/datasets/arabic-english_named_entities_dataset.json b/datasets/arabic-english_named_entities_dataset.json index 1a69d49..feac442 100644 --- a/datasets/arabic-english_named_entities_dataset.json +++ b/datasets/arabic-english_named_entities_dataset.json @@ -9,7 +9,7 @@ "Dialect": "mixed", "Domain": "news articles", "Form": "text", - "Collection Style": "crawling and annotation(translation)", + "Collection Style": "crawling,annotation,machine translation", "Description": "Arabic-ENglish named entities dataset is created using DBpedia Linked datasets and parallel corpus. For annotating NE in monolingual English corpus we used Gate tool. Our approach is based on linked data entities by mapping them to Gate Gazetteers, and then constructing a type-oriented NE base covering person, Location and organization classes. 
The second task consists of the use of machine translation to translate these entities and then finally, generating our NE lexicon that encloses the list of Arabic entities that match to the English lists.", "Volume": "48,753", "Unit": "tokens", diff --git a/datasets/arabic_dialects_dataset.json b/datasets/arabic_dialects_dataset.json index 4253fc2..9478304 100644 --- a/datasets/arabic_dialects_dataset.json +++ b/datasets/arabic_dialects_dataset.json @@ -40,7 +40,7 @@ "Dialect": "mixed", "Domain": "other", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "Dataset of Arabic dialects for GULF, EGYPT, LEVANT, TONESIAN Arabic dialects in addition to MSA.", "Volume": "16,494", "Unit": "sentences", diff --git a/datasets/arabic_flood_twitter_dataset.json b/datasets/arabic_flood_twitter_dataset.json index 39bb4a6..9234d80 100644 --- a/datasets/arabic_flood_twitter_dataset.json +++ b/datasets/arabic_flood_twitter_dataset.json @@ -9,7 +9,7 @@ "Dialect": "mixed", "Domain": "social media", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "It includes 4,037 human-labelled Arabic Twitter messages for four high-risk flood events that occurred in 2018", "Volume": "4,037", "Unit": "sentences", diff --git a/datasets/arabic_hate_speech_2022_shared_task.json b/datasets/arabic_hate_speech_2022_shared_task.json index 8d23c2b..f80d029 100644 --- a/datasets/arabic_hate_speech_2022_shared_task.json +++ b/datasets/arabic_hate_speech_2022_shared_task.json @@ -9,7 +9,7 @@ "Dialect": "mixed", "Domain": "social media", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "largest Arabic dataset for offensive, fine-grained hate speech, vulgar and violence content", "Volume": "12,698", "Unit": "sentences", diff --git a/datasets/arabic_keyphrase_dataset.json b/datasets/arabic_keyphrase_dataset.json index 0ca2613..25b7d0e 100644 --- a/datasets/arabic_keyphrase_dataset.json +++ b/datasets/arabic_keyphrase_dataset.json @@ -9,7 +9,7 @@ "Dialect": "Modern Standard Arabic", "Domain": "news articles", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "A dataset in Arabic language for automatic keyphrase extraction algorithms", "Volume": "400", "Unit": "documents", diff --git a/datasets/arabic_named_entities.json b/datasets/arabic_named_entities.json index 1b67fce..97c96a8 100644 --- a/datasets/arabic_named_entities.json +++ b/datasets/arabic_named_entities.json @@ -9,7 +9,7 @@ "Dialect": "Modern Standard Arabic", "Domain": "other", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "we have extracted\r\napproximately 45,000 Arabic NE", "Volume": "45,000", "Unit": "tokens", diff --git a/datasets/arabic_named_entity_gazetteer.json b/datasets/arabic_named_entity_gazetteer.json index b1c17af..bd5d06b 100644 --- a/datasets/arabic_named_entity_gazetteer.json +++ b/datasets/arabic_named_entity_gazetteer.json @@ -9,7 +9,7 @@ "Dialect": "Modern Standard Arabic", "Domain": "wikipedia", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "A gazetteer of entities curated from Wikipedia.", "Volume": "68,355", "Unit": "tokens", diff --git 
a/datasets/arabic_news_dataset_about_hajj.json b/datasets/arabic_news_dataset_about_hajj.json index af85e93..f456198 100644 --- a/datasets/arabic_news_dataset_about_hajj.json +++ b/datasets/arabic_news_dataset_about_hajj.json @@ -9,7 +9,7 @@ "Dialect": "Classical Arabic", "Domain": "news articles", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "more than 2k articles about Hajj ", "Volume": "2,000", "Unit": "documents", diff --git a/datasets/arabic_news_tweets.json b/datasets/arabic_news_tweets.json index 00a04db..d388c01 100644 --- a/datasets/arabic_news_tweets.json +++ b/datasets/arabic_news_tweets.json @@ -9,7 +9,7 @@ "Dialect": "Modern Standard Arabic", "Domain": "social media", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "This dataset is a relatively great size collection of Arabic news tweets that were collected from an official and verified users in Twitter. All news that is collected from the most popular and official users in Saudi Arabia belongs to Saudi Arabia news. All data that is gathered was retrieved using specific time period and collected all news in that time. To the best of our knowledge, this dataset is the first Arabic news data collection that does not specify by keywords and belongs to Saudi Arabia. This news dataset can be valuable for diverse tasks in NLP, such as text classification and automated verification system. The dataset has been categorized into 5 different news classes which are general news, regions news, sport news, economic news, and quality life news. In this data article, 89,179 original tweets have presented and fully labeled into related categories.", "Volume": "89,179", "Unit": "sentences", diff --git a/datasets/arabic_osact4___offensive_language_detection.json b/datasets/arabic_osact4___offensive_language_detection.json index b7dfc41..307055e 100644 --- a/datasets/arabic_osact4___offensive_language_detection.json +++ b/datasets/arabic_osact4___offensive_language_detection.json @@ -9,7 +9,7 @@ "Dialect": "mixed", "Domain": "social media", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "OSACT4 Shared Task on Offensive Language Detection", "Volume": "8,000", "Unit": "sentences", diff --git a/datasets/arabic_osact5___arabic_hate_speech.json b/datasets/arabic_osact5___arabic_hate_speech.json index 17e9616..d797998 100644 --- a/datasets/arabic_osact5___arabic_hate_speech.json +++ b/datasets/arabic_osact5___arabic_hate_speech.json @@ -9,7 +9,7 @@ "Dialect": "mixed", "Domain": "social media", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "Fine-Grained Hate Speech Detection on Arabic Twitter", "Volume": "10,157", "Unit": "sentences", diff --git a/datasets/arabic_pos_dialect.json b/datasets/arabic_pos_dialect.json index 30833b1..08e07e7 100644 --- a/datasets/arabic_pos_dialect.json +++ b/datasets/arabic_pos_dialect.json @@ -34,7 +34,7 @@ "Dialect": "mixed", "Domain": "social media", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "includes tweets in Egyptian, Levantine, Gulf, and Maghrebi, with 350 tweets for each dialect with appropriate train/test/development splits for 5-fold cross validation", "Volume": "1,400", "Unit": 
"sentences", diff --git a/datasets/arabic_punctuation_dataset.json b/datasets/arabic_punctuation_dataset.json index e313f41..253a7e0 100644 --- a/datasets/arabic_punctuation_dataset.json +++ b/datasets/arabic_punctuation_dataset.json @@ -9,7 +9,7 @@ "Dialect": "Modern Standard Arabic", "Domain": "books", "Form": "text", - "Collection Style": "crawling and annotation(translation)", + "Collection Style": "crawling,annotation,machine translation", "Description": "This is a curated dataset, specifically designed to facilitate the study of punctuation. It has undergone rigorous manual annotation and verification on the basis of sentence structure, with sentence boundaries clearly marked. ", "Volume": "12,183,000", "Unit": "sentences", diff --git a/datasets/arabic_rc_datasets.json b/datasets/arabic_rc_datasets.json index f608d47..9999d12 100644 --- a/datasets/arabic_rc_datasets.json +++ b/datasets/arabic_rc_datasets.json @@ -9,7 +9,7 @@ "Dialect": "Modern Standard Arabic", "Domain": "other", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "Arabic Reading Comprehension Benchmarks Created Semiautomatically", "Volume": "2,862", "Unit": "sentences", diff --git a/datasets/arabic_satire_dataset.json b/datasets/arabic_satire_dataset.json index 9ef4da6..30553e3 100644 --- a/datasets/arabic_satire_dataset.json +++ b/datasets/arabic_satire_dataset.json @@ -9,7 +9,7 @@ "Dialect": "Classical Arabic", "Domain": "other", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "500 Arabic news and 500 Arabic satire articles ", "Volume": "1,000", "Unit": "sentences", diff --git a/datasets/arabic_sentiment_lexicons.json b/datasets/arabic_sentiment_lexicons.json index a93520a..6775875 100644 --- a/datasets/arabic_sentiment_lexicons.json +++ b/datasets/arabic_sentiment_lexicons.json @@ -9,7 +9,7 @@ "Dialect": "mixed", "Domain": "social media", "Form": "text", - "Collection Style": "crawling and annotation(translation)", + "Collection Style": "crawling,annotation,machine translation", "Description": " by using distant supervision techniques on Arabic tweets, and by translating English sentiment lexicons into Arabic using a freely available statistical machine translation system", "Volume": "176,364", "Unit": "tokens", diff --git a/datasets/arabic_sentiment_twitter_corpus.json b/datasets/arabic_sentiment_twitter_corpus.json index 291c902..c37758c 100644 --- a/datasets/arabic_sentiment_twitter_corpus.json +++ b/datasets/arabic_sentiment_twitter_corpus.json @@ -9,7 +9,7 @@ "Dialect": "mixed", "Domain": "social media", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "A Sentiment Analysis dataset. No extra information is provided regarding the dialects nor the collection methodology", "Volume": "58,000", "Unit": "sentences", diff --git a/datasets/arabic_spam_and_ham_tweets.json b/datasets/arabic_spam_and_ham_tweets.json index 270cf46..76ea6e2 100644 --- a/datasets/arabic_spam_and_ham_tweets.json +++ b/datasets/arabic_spam_and_ham_tweets.json @@ -9,7 +9,7 @@ "Dialect": "mixed", "Domain": "social media", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "The dataset contains 13241 records. Each record represents a tweet. The tweets are labeled either Ham or Spam. Ham means non-spam tweet. 
There are 1924 Spam tweets and 11299 Ham tweets. The tweets are unique i.e. there are no repeated tweets records.", "Volume": "13,241", "Unit": "sentences", diff --git a/datasets/arabic_tweets_about_infectious_diseases.json b/datasets/arabic_tweets_about_infectious_diseases.json index 4e91cb6..1d35d87 100644 --- a/datasets/arabic_tweets_about_infectious_diseases.json +++ b/datasets/arabic_tweets_about_infectious_diseases.json @@ -9,7 +9,7 @@ "Dialect": "mixed", "Domain": "social media", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "This file contains a dataset of 1266 tweets by two Arabic native speakers into five types of sources: academic, media, government, health professional, and public.", "Volume": "1,266", "Unit": "sentences", diff --git a/datasets/arabic_wikireading_and_kaiflematha.json b/datasets/arabic_wikireading_and_kaiflematha.json index 983f13e..1142a85 100644 --- a/datasets/arabic_wikireading_and_kaiflematha.json +++ b/datasets/arabic_wikireading_and_kaiflematha.json @@ -9,7 +9,7 @@ "Dialect": "Modern Standard Arabic", "Domain": "wikipedia", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "high quality and large-scale Arabic reading comprehension datasets: Arabic WikiReading and KaifLematha with around +100 K instances.", "Volume": "100,000", "Unit": "documents", diff --git a/datasets/arabicaqa.json b/datasets/arabicaqa.json index 2e2e0cf..188d7b0 100644 --- a/datasets/arabicaqa.json +++ b/datasets/arabicaqa.json @@ -9,7 +9,7 @@ "Dialect": "Modern Standard Arabic", "Domain": "wikipedia", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "ArabicaQA is a robust dataset designed to support and advance the development of Arabic Question Answering (QA) systems. This dataset encompasses a wide range of question types, including both Machine Reading Comprehension (MRC) and Open-Domain questions, catering to various aspects of QA research and application. The dataset is structured to facilitate training, validation, and testing of Arabic QA models.", "Volume": "88,946", "Unit": "sentences", diff --git a/datasets/arabichatespeechdataset.json b/datasets/arabichatespeechdataset.json index 368aaf9..8ea56b0 100644 --- a/datasets/arabichatespeechdataset.json +++ b/datasets/arabichatespeechdataset.json @@ -9,7 +9,7 @@ "Dialect": "mixed", "Domain": "social media", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "The dataset contains 5361 Arabic tweets annotated with six categories: clean, offensive, and hateful speech (religion-based, ethnicity-based, nationality-based, gender-based). 
It focuses on Arabic dialects (Gulf and Modern Standard Arabic) and uses a three-level annotation schema.", "Volume": "5361", "Unit": "sentences", diff --git a/datasets/arabicmmlu.json b/datasets/arabicmmlu.json index 733a926..69b784b 100644 --- a/datasets/arabicmmlu.json +++ b/datasets/arabicmmlu.json @@ -9,7 +9,7 @@ "Dialect": "Modern Standard Arabic", "Domain": "other", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "ArabicMMLU is the first multi-task language understanding benchmark for Arabic language, sourced from school exams across diverse educational levels in different countries spanning North Africa, the Levant, and the Gulf regions. Our data comprises 40 tasks and 14,575 multiple-choice questions in Modern Standard Arabic (MSA), and is carefully constructed by collaborating with native speakers in the region.", "Volume": "14,575", "Unit": "sentences", diff --git a/datasets/arabicsa.json b/datasets/arabicsa.json index 90c9223..54a2fb4 100644 --- a/datasets/arabicsa.json +++ b/datasets/arabicsa.json @@ -9,7 +9,7 @@ "Dialect": "mixed", "Domain": "social media", "Form": "text", - "Collection Style": "crawling and annotation(translation)", + "Collection Style": "crawling,annotation,machine translation", "Description": "Arabic Emoticon Lexicon: Contains Arabic words and phrases frequently used on Twitter with sentiment annotation based on co-occurrence with positive and negative seed words. Arabic Hashtag Lexicon: Annotated with sentiment based on commonly used Arabic hashtags from Twitter. Arabic Hashtag Lexicon (dialectal): Focuses on dialectal Arabic words. BBN Blog Posts Sentiment Corpus: Contains Levantine dialect social media posts, manually annotated for sentiment. 
Syria Tweets Sentiment Corpus: A collection of tweets from Syria, manually annotated for sentiment.", "Volume": "231,155", "Unit": "tokens", diff --git a/datasets/aracovid19-mfh.json b/datasets/aracovid19-mfh.json index 0eb5ca3..46763f6 100644 --- a/datasets/aracovid19-mfh.json +++ b/datasets/aracovid19-mfh.json @@ -9,7 +9,7 @@ "Dialect": "mixed", "Domain": "social media", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "multi-label fake news and hate speech detection dataset each sentence is annotated with 10 labels", "Volume": "10,828", "Unit": "sentences", diff --git a/datasets/aracovid19-ssd__arabic_covid-19_sentiment_and_sarcasm_detection_dataset.json b/datasets/aracovid19-ssd__arabic_covid-19_sentiment_and_sarcasm_detection_dataset.json index abc3487..6f96e9b 100644 --- a/datasets/aracovid19-ssd__arabic_covid-19_sentiment_and_sarcasm_detection_dataset.json +++ b/datasets/aracovid19-ssd__arabic_covid-19_sentiment_and_sarcasm_detection_dataset.json @@ -9,7 +9,7 @@ "Dialect": "mixed", "Domain": "social media", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "AraCovid19-SSD is a manually annotated Arabic COVID-19 sarcasm and sentiment detection dataset containing 5,162 tweets.", "Volume": "5,162", "Unit": "sentences", diff --git a/datasets/aracust.json b/datasets/aracust.json index 4964cd8..3517d8b 100644 --- a/datasets/aracust.json +++ b/datasets/aracust.json @@ -9,7 +9,7 @@ "Dialect": "Saudi Arabia", "Domain": "social media", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "Saudi Telecom Tweets corpus for sentiment analysis", "Volume": "20,000", "Unit": "sentences", diff --git a/datasets/arafacts.json b/datasets/arafacts.json index 45918fc..2225c8d 100644 --- a/datasets/arafacts.json +++ b/datasets/arafacts.json @@ -9,7 +9,7 @@ "Dialect": "mixed", "Domain": "other", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "consists of 6,222 claims\r\nalong with their factual labels and additional\r\nmetadata, such as fact-checking article content,\r\ntopical category, and links to posts or Web\r\npages spreading the claim", "Volume": "6,222", "Unit": "sentences", diff --git a/datasets/aranews.json b/datasets/aranews.json index cc9897d..0fb0e93 100644 --- a/datasets/aranews.json +++ b/datasets/aranews.json @@ -9,7 +9,7 @@ "Dialect": "Modern Standard Arabic", "Domain": "news articles", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "a large-scale, multi-topic, and multi-country Arabic news dataset", "Volume": "1,000,000", "Unit": "documents", diff --git a/datasets/arap-tweet_corpus.json b/datasets/arap-tweet_corpus.json index 07669f0..1234e01 100644 --- a/datasets/arap-tweet_corpus.json +++ b/datasets/arap-tweet_corpus.json @@ -9,7 +9,7 @@ "Dialect": "mixed", "Domain": "social media", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "Arap-Tweet is a large-scale, multi-dialectal Arabic Twitter corpus containing 2.4 million tweets from 11 regions across 16 countries in the Arab world. The dataset includes annotations for dialect, age group, and gender of the users. 
Tweets were collected using region-specific keywords to ensure accurate dialect identification, and the profiles of users were manually verified and annotated by trained annotators. This corpus is designed to support research in author profiling, stylometry, and dialect identification, providing a valuable resource for natural language processing applications in Arabic.", "Volume": "2,400,000", "Unit": "sentences", diff --git a/datasets/arasencorpus.json b/datasets/arasencorpus.json index fcb371a..35ccb83 100644 --- a/datasets/arasencorpus.json +++ b/datasets/arasencorpus.json @@ -9,7 +9,7 @@ "Dialect": "mixed", "Domain": "social media", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": " contains 4.5 million tweets and covers both modern standard Arabic and some of the Arabic dialects", "Volume": "4,500,000", "Unit": "sentences", diff --git a/datasets/arasenti.json b/datasets/arasenti.json index aad462b..41d7ba9 100644 --- a/datasets/arasenti.json +++ b/datasets/arasenti.json @@ -9,7 +9,7 @@ "Dialect": "Saudi Arabia", "Domain": "social media", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "The corpus contains 17,573 tweets labelled with four labels for sentiment: positive, negative, neutral and mixed", "Volume": "17,573", "Unit": "sentences", diff --git a/datasets/arastance.json b/datasets/arastance.json index 882ec34..7c0987a 100644 --- a/datasets/arastance.json +++ b/datasets/arastance.json @@ -9,7 +9,7 @@ "Dialect": "Modern Standard Arabic", "Domain": "news articles", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "covers false and true claims from multiple domains (e.g., politics, sports, health) and several Arab countries", "Volume": "4,063", "Unit": "sentences", diff --git a/datasets/arc-wmi.json b/datasets/arc-wmi.json index 2c2a32d..89da22a 100644 --- a/datasets/arc-wmi.json +++ b/datasets/arc-wmi.json @@ -9,7 +9,7 @@ "Dialect": "mixed", "Domain": "other", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": " 4476 sentences with over 61k words, extracted from 94 sources of Arabic written medicine information", "Volume": "4,476", "Unit": "sentences", diff --git a/datasets/arcd.json b/datasets/arcd.json index 845e027..0895ed7 100644 --- a/datasets/arcd.json +++ b/datasets/arcd.json @@ -9,7 +9,7 @@ "Dialect": "Modern Standard Arabic", "Domain": "wikipedia", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "1,395 questions posed by crowdworkers on Wikipedia articles", "Volume": "1,395", "Unit": "sentences", diff --git a/datasets/arcorona__analyzing_arabic_tweets_in_the_early_days_of_coronavirus_(covid-19)_pandemic.json b/datasets/arcorona__analyzing_arabic_tweets_in_the_early_days_of_coronavirus_(covid-19)_pandemic.json index 4810367..44396df 100644 --- a/datasets/arcorona__analyzing_arabic_tweets_in_the_early_days_of_coronavirus_(covid-19)_pandemic.json +++ b/datasets/arcorona__analyzing_arabic_tweets_in_the_early_days_of_coronavirus_(covid-19)_pandemic.json @@ -9,7 +9,7 @@ "Dialect": "mixed", "Domain": "social media", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "Collected to 
prevent\r\nspreading of rumors and misinformation about\r\nthe virus or bad cures", "Volume": "8,000", "Unit": "sentences", diff --git a/datasets/arcov-19.json b/datasets/arcov-19.json index cddf51b..58d777c 100644 --- a/datasets/arcov-19.json +++ b/datasets/arcov-19.json @@ -9,7 +9,7 @@ "Dialect": "mixed", "Domain": "social media", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "Arabic COVID-19 Twitter dataset that covers the period from 27th of January till 5th of May 2021.", "Volume": "3,140,158", "Unit": "sentences", diff --git a/datasets/arcov19-rumors.json b/datasets/arcov19-rumors.json index 02dffb7..a793cbd 100644 --- a/datasets/arcov19-rumors.json +++ b/datasets/arcov19-rumors.json @@ -9,7 +9,7 @@ "Dialect": "mixed", "Domain": "social media", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "The first Arabic dataset for rumors verification in Twitter", "Volume": "9,414", "Unit": "sentences", diff --git a/datasets/arcovidvac.json b/datasets/arcovidvac.json index 0b69876..b0b1c78 100644 --- a/datasets/arcovidvac.json +++ b/datasets/arcovidvac.json @@ -9,7 +9,7 @@ "Dialect": "mixed", "Domain": "social media", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "the largest manually annotated Arabic tweet dataset, ArCovidVac, for the COVID-19 vaccination campaign, covering many countries in the Arab region", "Volume": "10,000", "Unit": "sentences", diff --git a/datasets/arentail.json b/datasets/arentail.json index 417d577..6d337db 100644 --- a/datasets/arentail.json +++ b/datasets/arentail.json @@ -9,7 +9,7 @@ "Dialect": "Modern Standard Arabic", "Domain": "news articles", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "Arabic NLI dataset called ArEntail, consisting of 6000 sentence pairs collected from news headlines and manually labeled to indicate whether an entailment relationship links the sentences or not without resorting to machine translation from English datasets", "Volume": "6,000", "Unit": "sentences", diff --git a/datasets/armi__arabic_misogynistic_dataset.json b/datasets/armi__arabic_misogynistic_dataset.json index 81a06e5..7b40352 100644 --- a/datasets/armi__arabic_misogynistic_dataset.json +++ b/datasets/armi__arabic_misogynistic_dataset.json @@ -9,7 +9,7 @@ "Dialect": "mixed", "Domain": "social media", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "Arabic multidialectal dataset for misogynistic language", "Volume": "9,833", "Unit": "sentences", diff --git a/datasets/arparallel.json b/datasets/arparallel.json index 2ec7b49..dbfe7b1 100644 --- a/datasets/arparallel.json +++ b/datasets/arparallel.json @@ -9,7 +9,7 @@ "Dialect": "Modern Standard Arabic", "Domain": "other", "Form": "text", - "Collection Style": "crawling and annotation(translation)", + "Collection Style": "crawling,annotation,machine translation", "Description": "The first monolingual parallel corpus of Arabic generated automatically from translating a bilingual English-French corpus. 
It can be used to train sequence-to-sequence models for paraphrasing, machine translation, text simplification, and other language generation tasks.", "Volume": "100,000", "Unit": "sentences", diff --git a/datasets/arquad.json b/datasets/arquad.json index 3aa41ff..641de46 100644 --- a/datasets/arquad.json +++ b/datasets/arquad.json @@ -9,7 +9,7 @@ "Dialect": "Modern Standard Arabic", "Domain": "wikipedia", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "a large MRC dataset for the Arabic language. The dataset comprises 16,020 questions posed by language experts on passages extracted from Arabic Wikipedia articles, where the answer to each question is a text segment from the corresponding reading passage.", "Volume": "16,020", "Unit": "sentences", diff --git a/datasets/arsarcasm-v2.json b/datasets/arsarcasm-v2.json index d4b0f54..4f1f484 100644 --- a/datasets/arsarcasm-v2.json +++ b/datasets/arsarcasm-v2.json @@ -40,7 +40,7 @@ "Dialect": "mixed", "Domain": "social media", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "ArSarcasm-v2 is an extension of the original ArSarcasm dataset published along with the paper From Arabic Sentiment Analysis to Sarcasm Detection: The ArSarcasm Dataset. ArSarcasm-v2 consists of ArSarcasm along with portions of DAICT corpus and some new tweets. Each tweet was annotated for sarcasm, sentiment and dialect. The final dataset consists of 15,548 tweets divided into 12,548 training tweets and 3,000 testing tweets. ArSarcasm-v2 was used and released as a part of the shared task on sarcasm detection and sentiment analysis in Arabic", "Volume": "15,548", "Unit": "sentences", diff --git a/datasets/arsarcasm.json b/datasets/arsarcasm.json index 2d8ff07..2b8f43f 100644 --- a/datasets/arsarcasm.json +++ b/datasets/arsarcasm.json @@ -40,7 +40,7 @@ "Dialect": "mixed", "Domain": "social media", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "The dataset was created using previously available Arabic sentiment analysis datasets", "Volume": "8,437", "Unit": "sentences", diff --git a/datasets/arsas.json b/datasets/arsas.json index f7cad91..4ffd6a4 100644 --- a/datasets/arsas.json +++ b/datasets/arsas.json @@ -9,7 +9,7 @@ "Dialect": "mixed", "Domain": "social media", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "A set of 21K Arabic tweets labeled for 4 classes of sentiment and 6 classes of speech-act", "Volume": "21,000", "Unit": "sentences", diff --git a/datasets/arsen-20.json b/datasets/arsen-20.json index c4a9ee4..47d4c2a 100644 --- a/datasets/arsen-20.json +++ b/datasets/arsen-20.json @@ -9,7 +9,7 @@ "Dialect": "mixed", "Domain": "social media", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "ArSen-20, a benchmark dataset tailored to propel Arabic sentiment detection forward. 
ArSen-20 comprises 20,000 professionally labeled tweets sourced from Twitter, focusing on the theme of COVID-19 and spanning the period from 2020 to 2023", "Volume": "20,000", "Unit": "sentences", diff --git a/datasets/arsentd-lev.json b/datasets/arsentd-lev.json index 22ee6d7..62d3bdb 100644 --- a/datasets/arsentd-lev.json +++ b/datasets/arsentd-lev.json @@ -9,7 +9,7 @@ "Dialect": "Levant", "Domain": "social media", "Form": "text", - "Collection Style": "crawling and annotation(translation)", + "Collection Style": "crawling,annotation,machine translation", "Description": "ArSentD-LEV is a multi-topic corpus for target-based sentiment analysis in Arabic Levantine tweets", "Volume": "4,000", "Unit": "sentences", diff --git a/datasets/artest.json b/datasets/artest.json index 1cd507c..490f1e6 100644 --- a/datasets/artest.json +++ b/datasets/artest.json @@ -9,7 +9,7 @@ "Dialect": "mixed", "Domain": "web pages", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "ArTest was built on top of ArabicWeb'16 Web collection. If you are interested in getting the collection, please check our ArabicWeb16 Website", "Volume": "10,529", "Unit": "sentences", diff --git a/datasets/artrivia.json b/datasets/artrivia.json index a9d97ea..1c18710 100644 --- a/datasets/artrivia.json +++ b/datasets/artrivia.json @@ -9,7 +9,7 @@ "Dialect": "Modern Standard Arabic", "Domain": "wikipedia", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "10,045 question-answer-passages triplets extracted from Wikipedia passages", "Volume": "10,045", "Unit": "sentences", diff --git a/datasets/asad.json b/datasets/asad.json index a5a9fc6..e210468 100644 --- a/datasets/asad.json +++ b/datasets/asad.json @@ -9,7 +9,7 @@ "Dialect": "mixed", "Domain": "social media", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": " ASAD is a large, high-quality annotated dataset\r\n(including 95K tweets), with three-class sentiment labels (positive, negative and neutral)", "Volume": "100,000", "Unit": "sentences", diff --git a/datasets/astad.json b/datasets/astad.json index 7e892d3..1dae034 100644 --- a/datasets/astad.json +++ b/datasets/astad.json @@ -9,7 +9,7 @@ "Dialect": "mixed", "Domain": "social media", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "It contains 58K Arabic tweets (47K training, 11K test) tweets annotated in positive and negative labels", "Volume": "58,000", "Unit": "sentences", diff --git a/datasets/astd.json b/datasets/astd.json index 64bf613..7335efa 100644 --- a/datasets/astd.json +++ b/datasets/astd.json @@ -9,7 +9,7 @@ "Dialect": "mixed", "Domain": "social media", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "10k Arabic sentiment tweets classified into four classes subjective positive, subjective negative, subjective mixed, and objective", "Volume": "10,006", "Unit": "sentences", diff --git a/datasets/at-odtsa.json b/datasets/at-odtsa.json index 0e37041..d9dd731 100644 --- a/datasets/at-odtsa.json +++ b/datasets/at-odtsa.json @@ -9,7 +9,7 @@ "Dialect": "mixed", "Domain": "social media", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": 
"A dataset of Arabic Tweets for Open-Domain Targeted Sentiment Analysis, which includes Arabic tweets along with labels that specify targets (topics) and sentiments (opinions) expressed in the collected tweets.", "Volume": "3,000", "Unit": "sentences", diff --git a/datasets/attimam.json b/datasets/attimam.json index 05e018e..ed08ebc 100644 --- a/datasets/attimam.json +++ b/datasets/attimam.json @@ -9,7 +9,7 @@ "Dialect": "Modern Standard Arabic", "Domain": "news articles", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "AttImam was developed by Al-Imam Mohammad Ibn Saud Islamic University and consists of approximately 2,000 attribution relations applied to Arabic newswire text from Arabic Treebank: Part 1 v 4.1 (LDC2010T13). Attribution refers to the process of reporting or assigning an utterance to the correct speaker.", "Volume": "2,000", "Unit": "sentences", diff --git a/datasets/author_attribution_tweets.json b/datasets/author_attribution_tweets.json index 13ef882..007abdf 100644 --- a/datasets/author_attribution_tweets.json +++ b/datasets/author_attribution_tweets.json @@ -9,7 +9,7 @@ "Dialect": "Modern Standard Arabic", "Domain": "social media", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": " consists of 71,397 tweets for 45 authors for MSA collected from twitter. ", "Volume": "71,397", "Unit": "sentences", diff --git a/datasets/autotweet.json b/datasets/autotweet.json index 464338e..778b4e6 100644 --- a/datasets/autotweet.json +++ b/datasets/autotweet.json @@ -9,7 +9,7 @@ "Dialect": "mixed", "Domain": "social media", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": " classification of Arabic tweets into automated or manual.", "Volume": "3,503", "Unit": "sentences", diff --git a/datasets/baec.json b/datasets/baec.json index 1cb71ed..6412b46 100644 --- a/datasets/baec.json +++ b/datasets/baec.json @@ -28,7 +28,7 @@ "Dialect": "mixed", "Domain": "social media", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "consists of 45,251 words and is 436 KB in size.It was collected from different Facebook pages", "Volume": "473,796", "Unit": "tokens", diff --git a/datasets/bbn_blog_posts_sentiment_corpus.json b/datasets/bbn_blog_posts_sentiment_corpus.json index ebfcbd8..1102ca5 100644 --- a/datasets/bbn_blog_posts_sentiment_corpus.json +++ b/datasets/bbn_blog_posts_sentiment_corpus.json @@ -9,7 +9,7 @@ "Dialect": "Levant", "Domain": "social media", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "A random subset of 1200 Levantine dialectal sentences\r\nchosen from the BBN Arabic-Dialect\u2013English Parallel Text", "Volume": "1,200", "Unit": "sentences", diff --git a/datasets/brad_1_0.json b/datasets/brad_1_0.json index f690781..ad1e9a7 100644 --- a/datasets/brad_1_0.json +++ b/datasets/brad_1_0.json @@ -9,7 +9,7 @@ "Dialect": "mixed", "Domain": "reviews", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "The reviews were collected from GoodReads.com website during June/July 2016", "Volume": "156,506", "Unit": "sentences", diff --git a/datasets/calliar.json b/datasets/calliar.json index 
4a574c1..2477a37 100644 --- a/datasets/calliar.json +++ b/datasets/calliar.json @@ -9,7 +9,7 @@ "Dialect": "Modern Standard Arabic", "Domain": "other", "Form": "images", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "Calliar is a dataset for Arabic calligraphy. The dataset consists of 2500 json files that contain strokes manually annotated for Arabic calligraphy. This repository contains the dataset for the following paper ", "Volume": "2,500", "Unit": "images", diff --git a/datasets/checkthat-ar.json b/datasets/checkthat-ar.json index ce4a402..22f6ece 100644 --- a/datasets/checkthat-ar.json +++ b/datasets/checkthat-ar.json @@ -9,7 +9,7 @@ "Dialect": "mixed", "Domain": "social media", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "check-worthiness datasets", "Volume": "7,500", "Unit": "sentences", diff --git a/datasets/cidar.json b/datasets/cidar.json index 6aa2cec..07db50c 100644 --- a/datasets/cidar.json +++ b/datasets/cidar.json @@ -9,7 +9,7 @@ "Dialect": "Modern Standard Arabic", "Domain": "other", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "CIDAR contains 10,000 instructions and their output. The dataset was created by selecting around 9,109 samples from the Alpagasus dataset then translating it to Arabic using ChatGPT. In addition, we append that with around 891 Arabic grammar instructions from the website Ask the teacher. All the 10,000 samples were reviewed by around 12 reviewers.", "Volume": "10,000", "Unit": "sentences", diff --git a/datasets/comparable_wikipedia_coprus.json b/datasets/comparable_wikipedia_coprus.json index 4a8b8f9..ad8c734 100644 --- a/datasets/comparable_wikipedia_coprus.json +++ b/datasets/comparable_wikipedia_coprus.json @@ -22,7 +22,7 @@ "Dialect": "mixed", "Domain": "wikipedia", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "Comparable Wikipedia Corpus (aligned documents) Corpus extracts from 20-01-2017 Wikipedia dumps", "Volume": "20,394", "Unit": "documents", diff --git a/datasets/corpora_for_egyptian_arabic_and_gulf_arabic_from_twitter.json b/datasets/corpora_for_egyptian_arabic_and_gulf_arabic_from_twitter.json index 853792c..7151b41 100644 --- a/datasets/corpora_for_egyptian_arabic_and_gulf_arabic_from_twitter.json +++ b/datasets/corpora_for_egyptian_arabic_and_gulf_arabic_from_twitter.json @@ -22,7 +22,7 @@ "Dialect": "mixed", "Domain": "social media", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "An evenly balanced dataset of Arabic dialects, Egyptian and Gulf, using a variety of dialectal terms.", "Volume": "300,000", "Unit": "sentences", diff --git a/datasets/corpus_of_offensive_language_in_arabic.json b/datasets/corpus_of_offensive_language_in_arabic.json index a7d96d1..ef510c8 100644 --- a/datasets/corpus_of_offensive_language_in_arabic.json +++ b/datasets/corpus_of_offensive_language_in_arabic.json @@ -9,7 +9,7 @@ "Dialect": "mixed", "Domain": "social media", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "a dataset of YouTube comments in Arabic, specifically designed to be used for the detection of offensive language in a machine learning scenario",
"Volume": "15,050", "Unit": "sentences", diff --git a/datasets/covid-19_disinfo__covid-19_disinformation_twitter_dataset.json b/datasets/covid-19_disinfo__covid-19_disinformation_twitter_dataset.json index 068f649..6905e42 100644 --- a/datasets/covid-19_disinfo__covid-19_disinformation_twitter_dataset.json +++ b/datasets/covid-19_disinfo__covid-19_disinformation_twitter_dataset.json @@ -9,7 +9,7 @@ "Dialect": "mixed", "Domain": "social media", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "With the emergence of the COVID-19 pandemic, the political and the medical aspects of disinformation merged as the problem got elevated to a whole new level to become the first global infodemic. Fighting this infodemic has been declared one of the most important focus areas of the World Health Organization, with dangers ranging from promoting fake cures, rumors, and conspiracy theories to spreading xenophobia and panic. Addressing the issue requires solving a number of challenging problems such as identifying messages containing claims, determining their check-worthiness and factuality, and their potential to do harm as well as the nature of that harm, to mention just a few. To address this gap, we release a large dataset of 16K manually annotated tweets for fine-grained disinformation analysis that focuses on COVID-19, combines the perspectives and the interests of journalists, fact-checkers, social media platforms, policy makers, and society, and covers Arabic, Bulgarian, Dutch, and English. Finally, we show strong evaluation results using pretrained Transformers, thus confirming the practical utility of the dataset in monolingual multilingual, and single task vs. multitask settings.", "Volume": "5,000", "Unit": "sentences", diff --git a/datasets/covid-fakes.json b/datasets/covid-fakes.json index 6fea0c8..63231b5 100644 --- a/datasets/covid-fakes.json +++ b/datasets/covid-fakes.json @@ -9,7 +9,7 @@ "Dialect": "mixed", "Domain": "social media", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "Bilingual (Arabic/English) COVID-19 Twitter dataset for misleading information detection", "Volume": "3,263,000", "Unit": "sentences", diff --git a/datasets/covost_2.json b/datasets/covost_2.json index 8524683..eeae86f 100644 --- a/datasets/covost_2.json +++ b/datasets/covost_2.json @@ -9,7 +9,7 @@ "Dialect": "Modern Standard Arabic", "Domain": "transcribed audio", "Form": "spoken", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "a large-scale multilingual ST corpus based on Common Voice, to foster ST research with the largest ever open dataset. 
Its latest version covers translations from English into 15 languages---Arabic, Catalan, Welsh, German, Estonian, Persian, Indonesian, Japanese, Latvian, Mongolian, Slovenian, Swedish, Tamil, Turkish, Chinese", "Volume": "6", "Unit": "hours", diff --git a/datasets/cqa-md__semeval-2016_task_3.json b/datasets/cqa-md__semeval-2016_task_3.json index 4a1ff03..394ae89 100644 --- a/datasets/cqa-md__semeval-2016_task_3.json +++ b/datasets/cqa-md__semeval-2016_task_3.json @@ -9,7 +9,7 @@ "Dialect": "Modern Standard Arabic", "Domain": "web pages", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "It includes a TRAIN/DEV split with reliable double-checked DEV (1,281 original questions, and 37,795 potentially related question-answer pairs) + unannotated (163,383 question--answer pairs)", "Volume": "45,164", "Unit": "sentences", diff --git a/datasets/daict.json b/datasets/daict.json index d3ecd3e..c10e725 100644 --- a/datasets/daict.json +++ b/datasets/daict.json @@ -9,7 +9,7 @@ "Dialect": "mixed", "Domain": "social media", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "The dataset includes 5,588 tweets -- written in both MSA and dialectual Arabic -- manually annotated by two professional linguistics from HBKU", "Volume": "5,588", "Unit": "sentences", diff --git a/datasets/dart.json b/datasets/dart.json index f95493f..5aab81a 100644 --- a/datasets/dart.json +++ b/datasets/dart.json @@ -40,7 +40,7 @@ "Dialect": "mixed", "Domain": "social media", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "Dialectal Arabic Tweets", "Volume": "24,280", "Unit": "sentences", diff --git a/datasets/dawqas__a_dataset_for_arabic_why_question_answering_system.json b/datasets/dawqas__a_dataset_for_arabic_why_question_answering_system.json index 9377f79..6f42570 100644 --- a/datasets/dawqas__a_dataset_for_arabic_why_question_answering_system.json +++ b/datasets/dawqas__a_dataset_for_arabic_why_question_answering_system.json @@ -9,7 +9,7 @@ "Dialect": "Modern Standard Arabic", "Domain": "other", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "A Dataset for Arabic Why Question Answering System", "Volume": "3,205", "Unit": "sentences", diff --git a/datasets/defarabicqa.json b/datasets/defarabicqa.json index af7d1f1..6ffe54b 100644 --- a/datasets/defarabicqa.json +++ b/datasets/defarabicqa.json @@ -9,7 +9,7 @@ "Dialect": "Modern Standard Arabic", "Domain": "wikipedia", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "2000 snippets returned by Google search engine and Wikipedia Arabic version\nand a set of 50 organization definition questions", "Volume": "2,000", "Unit": "sentences", diff --git a/datasets/dziribert.json b/datasets/dziribert.json index ef68030..71691a8 100644 --- a/datasets/dziribert.json +++ b/datasets/dziribert.json @@ -9,7 +9,7 @@ "Dialect": "Algeria", "Domain": "social media", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "The DziriBERT dataset contains over 1.1 million tweets written in the Algerian dialect, collected from Twitter. It includes tweets written in both Arabic script and Romanized script (Arabizi). 
The dataset is designed to develop language models specifically for the Algerian dialect, which differs from Modern Standard Arabic (MSA).", "Volume": "1,100,000", "Unit": "sentences", diff --git a/datasets/edgad.json b/datasets/edgad.json index b14e263..4d0793a 100644 --- a/datasets/edgad.json +++ b/datasets/edgad.json @@ -9,7 +9,7 @@ "Dialect": "Egypt", "Domain": "social media", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "Egyptian Dialect Gender Annotated Dataset (EDGAD) obtained from Twitter as well as a proposed text classification solution for the Gender Identification (GI) problem. The dataset consists of 70,000 tweets per gender", "Volume": "140,000", "Unit": "sentences", diff --git a/datasets/emotional-tone.json b/datasets/emotional-tone.json index 0d597be..9fbd4b2 100644 --- a/datasets/emotional-tone.json +++ b/datasets/emotional-tone.json @@ -9,7 +9,7 @@ "Dialect": "Egypt", "Domain": "social media", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "emotion detection dataset", "Volume": "10,065", "Unit": "sentences", diff --git a/datasets/evetar.json b/datasets/evetar.json index b389eb9..0c69a3e 100644 --- a/datasets/evetar.json +++ b/datasets/evetar.json @@ -9,7 +9,7 @@ "Dialect": "mixed", "Domain": "social media", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "a crawl of 355M Arabic tweets and covers 50 significant events", "Volume": "3,550,000", "Unit": "sentences", diff --git a/datasets/flodusta.json b/datasets/flodusta.json index d7cca8b..5232153 100644 --- a/datasets/flodusta.json +++ b/datasets/flodusta.json @@ -9,7 +9,7 @@ "Dialect": "Saudi Arabia", "Domain": "social media", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "FloDusTA is a dataset of annotated tweets collected for the purpose of developing an event detection system. The dataset contains tweets written in both the MSA and Saudi dialect. 
Labels are: flood, dust storm, traffic accident, and non-event.", "Volume": "8,998", "Unit": "sentences", diff --git a/datasets/flores-101.json b/datasets/flores-101.json index 60360af..0aacd21 100644 --- a/datasets/flores-101.json +++ b/datasets/flores-101.json @@ -9,7 +9,7 @@ "Dialect": "Modern Standard Arabic", "Domain": "other", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "Low Resource MT Benchmark", "Volume": "3,100,000", "Unit": "tokens", diff --git a/datasets/gem.json b/datasets/gem.json index 7ee4e71..66db00e 100644 --- a/datasets/gem.json +++ b/datasets/gem.json @@ -9,7 +9,7 @@ "Dialect": "Modern Standard Arabic", "Domain": "wikipedia", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "benchmark environment for Natural Language Generation with a focus on its Evaluation, both through human annotations and automated Metric", "Volume": "29,229", "Unit": "documents", diff --git a/datasets/gem_-_wikilingua.json b/datasets/gem_-_wikilingua.json index 679e95d..86dae1a 100644 --- a/datasets/gem_-_wikilingua.json +++ b/datasets/gem_-_wikilingua.json @@ -9,7 +9,7 @@ "Dialect": "Modern Standard Arabic", "Domain": "wikipedia", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "New Benchmark Dataset for Multilingual Abstractive Summarization", "Volume": "29,229", "Unit": "sentences", diff --git a/datasets/gem_-_xlsum.json b/datasets/gem_-_xlsum.json index 5097ca1..2a2a105 100644 --- a/datasets/gem_-_xlsum.json +++ b/datasets/gem_-_xlsum.json @@ -9,7 +9,7 @@ "Dialect": "Modern Standard Arabic", "Domain": "news articles", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "Large-Scale Multilingual Abstractive Summarization for 44 Languages\" ", "Volume": "46,897", "Unit": "sentences", diff --git a/datasets/gumar.json b/datasets/gumar.json index b5c3396..138baea 100644 --- a/datasets/gumar.json +++ b/datasets/gumar.json @@ -58,7 +58,7 @@ "Dialect": "mixed", "Domain": "other", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "a large-scale corpus of Gulf Arabic consisting of 110 million words from 1,200 forum novels", "Volume": "1,236", "Unit": "documents", diff --git a/datasets/haad.json b/datasets/haad.json index 50df12a..225c987 100644 --- a/datasets/haad.json +++ b/datasets/haad.json @@ -9,7 +9,7 @@ "Dialect": "Modern Standard Arabic", "Domain": "books", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "Human Annotated Arabic Dataset of Book Reviews for Aspect Based Sentiment Analysis", "Volume": "2,389", "Unit": "sentences", diff --git a/datasets/idat.json b/datasets/idat.json index d6ec5cb..cb3317d 100644 --- a/datasets/idat.json +++ b/datasets/idat.json @@ -9,7 +9,7 @@ "Dialect": "mixed", "Domain": "social media", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "written in\r\nModern Standard Arabic but also in different Arabic language varieties\r\nincluding Egypt, Gulf, Levantine and Maghrebi dialects", "Volume": "5,030", "Unit": "sentences", diff --git a/datasets/idrisi-r.json b/datasets/idrisi-r.json index 
18e3ddc..b8056ba 100644 --- a/datasets/idrisi-r.json +++ b/datasets/idrisi-r.json @@ -34,7 +34,7 @@ "Dialect": "mixed", "Domain": "social media", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "IDRISI-R is the largest-scale publicly-available Twitter Location Mention Recognition (LMR) dataset, in both English and Arabic languages. It contains 41 disaster events of different types such as floods, fires, etc. In addition to tagging LMs in text, the LMs are labeled for location types such as countries, cities, streets, POIs, etc.", "Volume": "1,268,912", "Unit": "sentences", diff --git a/datasets/isarcasmeval__semeval-2022_task_6.json b/datasets/isarcasmeval__semeval-2022_task_6.json index b121fa4..0a209ea 100644 --- a/datasets/isarcasmeval__semeval-2022_task_6.json +++ b/datasets/isarcasmeval__semeval-2022_task_6.json @@ -9,7 +9,7 @@ "Dialect": "mixed", "Domain": "social media", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "A Dataset of Intended Sarcasm", "Volume": "4,447", "Unit": "sentences", diff --git a/datasets/kalamdz.json b/datasets/kalamdz.json index cbba4e0..ed96bc2 100644 --- a/datasets/kalamdz.json +++ b/datasets/kalamdz.json @@ -9,7 +9,7 @@ "Dialect": "Algeria", "Domain": "transcribed audio", "Form": "spoken", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "8 major Algerian Arabic sub-dialects with 4881\r\nspeakers and more than 104.4 hours segmented in utterances of at least 6 s", "Volume": "104", "Unit": "hours", diff --git a/datasets/kawarith.json b/datasets/kawarith.json index 91b2008..e9040ac 100644 --- a/datasets/kawarith.json +++ b/datasets/kawarith.json @@ -9,7 +9,7 @@ "Dialect": "mixed", "Domain": "social media", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "a multi-dialect Arabic Twitter corpus for crisis events, comprising more than\r\na million Arabic tweets collected during 22\r\ncrises that occurred between 2018 and 2020\r\nand involved several types of hazard", "Volume": "12,446", "Unit": "sentences", diff --git a/datasets/ksaa-rd_dataset.json b/datasets/ksaa-rd_dataset.json index 9710977..9b14168 100644 --- a/datasets/ksaa-rd_dataset.json +++ b/datasets/ksaa-rd_dataset.json @@ -9,7 +9,7 @@ "Dialect": "Modern Standard Arabic", "Domain": "other", "Form": "text", - "Collection Style": "crawling and annotation(translation)", + "Collection Style": "crawling,annotation,machine translation", "Description": " The KSAA-RD dataset contains over 58,000 Arabic entries and 63,000 English entries. Each entry consists of a word (lemma), its part of speech, and a gloss (definition). 
The dataset provides both contextualized word embeddings (from AraELECTRA) and fixed word embeddings (from AraVec\u2019s skip-gram model).", "Volume": "58,000", "Unit": "tokens", diff --git a/datasets/ksu_rich_arabic_speech_database.json b/datasets/ksu_rich_arabic_speech_database.json index ffa1768..c06a11f 100644 --- a/datasets/ksu_rich_arabic_speech_database.json +++ b/datasets/ksu_rich_arabic_speech_database.json @@ -9,7 +9,7 @@ "Dialect": "Modern Standard Arabic", "Domain": "transcribed audio", "Form": "spoken", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "it has 752 speakers; the speakers are from different\r\nethnic groups: Saudis, Arabs, and non-Arabs;", "Volume": "590", "Unit": "hours", diff --git a/datasets/l-hsab.json b/datasets/l-hsab.json index 3a5dddf..717ba83 100644 --- a/datasets/l-hsab.json +++ b/datasets/l-hsab.json @@ -9,7 +9,7 @@ "Dialect": "Levant", "Domain": "social media", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "Arabic Levantine Hate Speech and Abusive Language Dataset", "Volume": "5,851", "Unit": "sentences", diff --git a/datasets/labr.json b/datasets/labr.json index 67b280b..08069c3 100644 --- a/datasets/labr.json +++ b/datasets/labr.json @@ -9,7 +9,7 @@ "Dialect": "mixed", "Domain": "reviews", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "The largest sentiment analysis dataset to-date for the Arabic language.", "Volume": "63,257", "Unit": "sentences", diff --git a/datasets/lama.json b/datasets/lama.json index 1ee7594..d6af3ca 100644 --- a/datasets/lama.json +++ b/datasets/lama.json @@ -9,7 +9,7 @@ "Dialect": "Modern Standard Arabic", "Domain": "social media", "Form": "text", - "Collection Style": "crawling and annotation(translation)", + "Collection Style": "crawling,annotation,machine translation", "Description": "A dataset for Modern Standard and Dialectal Arabic emotion detection focused at Robert Plutchik\u2019s 8 basic emotion types", "Volume": "7,000", "Unit": "sentences", diff --git a/datasets/large_multi-domain_resources_for_arabic_sentiment_analysis.json b/datasets/large_multi-domain_resources_for_arabic_sentiment_analysis.json index 135fd66..8321762 100644 --- a/datasets/large_multi-domain_resources_for_arabic_sentiment_analysis.json +++ b/datasets/large_multi-domain_resources_for_arabic_sentiment_analysis.json @@ -9,7 +9,7 @@ "Dialect": "mixed", "Domain": "reviews", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "Large Multi-Domain Resources for Arabic Sentiment Analysis", "Volume": "45,498", "Unit": "sentences", diff --git a/datasets/let-mi.json b/datasets/let-mi.json index f559863..86457ac 100644 --- a/datasets/let-mi.json +++ b/datasets/let-mi.json @@ -9,7 +9,7 @@ "Dialect": "Levant", "Domain": "social media", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "Levantine Twitter dataset for Misogynistic language", "Volume": "6,603", "Unit": "sentences", diff --git a/datasets/lince_-_msa-da__(lid_-_code_switching_).json b/datasets/lince_-_msa-da__(lid_-_code_switching_).json index 9b73b06..11bb3fa 100644 --- a/datasets/lince_-_msa-da__(lid_-_code_switching_).json +++ b/datasets/lince_-_msa-da__(lid_-_code_switching_).json @@ -9,7 +9,7 @@ 
"Dialect": "mixed", "Domain": "social media", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "Twitter data and 9 entity types to establish a new dataset for code-switched NER\r\nbenchmarks.", "Volume": "11,241", "Unit": "sentences", diff --git a/datasets/lince_-_msa-egy_(ner_-_code_switching).json b/datasets/lince_-_msa-egy_(ner_-_code_switching).json index 5223c17..530b754 100644 --- a/datasets/lince_-_msa-egy_(ner_-_code_switching).json +++ b/datasets/lince_-_msa-egy_(ner_-_code_switching).json @@ -9,7 +9,7 @@ "Dialect": "mixed", "Domain": "social media", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "Modern Standard ArabicDialectal Arabic (MSA-DA)", "Volume": "11,224", "Unit": "sentences", diff --git a/datasets/lisan.json b/datasets/lisan.json index d610ed4..20a3b87 100644 --- a/datasets/lisan.json +++ b/datasets/lisan.json @@ -34,7 +34,7 @@ "Dialect": "mixed", "Domain": "social media", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "A morphologically-annotated Yemeni, Sudanese, Iraqi, and Libyan Arabic dialects Lisan corpora. Lisan features around 1.2 million tokens. It was collected the content of the corpora from several social media platforms.", "Volume": "1,200,000", "Unit": "tokens", diff --git a/datasets/marsa__multi-domain_arabic_resources_for_sentiment_analysis.json b/datasets/marsa__multi-domain_arabic_resources_for_sentiment_analysis.json index a0ff05e..154ffc4 100644 --- a/datasets/marsa__multi-domain_arabic_resources_for_sentiment_analysis.json +++ b/datasets/marsa__multi-domain_arabic_resources_for_sentiment_analysis.json @@ -9,7 +9,7 @@ "Dialect": "Gulf", "Domain": "social media", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "MARSA\u2014the largest sentiment annotated corpus for Dialectal Arabic (DA) in the Gulf region, which consists of 61,353 manually labeled tweets that contain a total of 840 K tokens. 
The tweets were collected from trending hashtags in four domains: political, social, sports, and technology to create a multi-domain corpus.", "Volume": "61,353", "Unit": "sentences", diff --git a/datasets/masc.json b/datasets/masc.json index b311609..b9d2d31 100644 --- a/datasets/masc.json +++ b/datasets/masc.json @@ -9,7 +9,7 @@ "Dialect": "mixed", "Domain": "reviews", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "Multi-domain Arabic Sentiment Corpus (MASC) with a size of 8860 positive and negative reviews from different domains", "Volume": "8,860", "Unit": "sentences", diff --git a/datasets/masc__massive_arabic_speech_corpus.json b/datasets/masc__massive_arabic_speech_corpus.json index 12f945f..de03440 100644 --- a/datasets/masc__massive_arabic_speech_corpus.json +++ b/datasets/masc__massive_arabic_speech_corpus.json @@ -9,7 +9,7 @@ "Dialect": "mixed", "Domain": "transcribed audio", "Form": "spoken", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "This corpus is a dataset that contains 1,000 hours of speech sampled at 16~kHz and crawled from over 700 YouTube channels.", "Volume": "1,000", "Unit": "hours", diff --git a/datasets/mawqif.json b/datasets/mawqif.json index ba3f518..2db1f5c 100644 --- a/datasets/mawqif.json +++ b/datasets/mawqif.json @@ -9,7 +9,7 @@ "Dialect": "mixed", "Domain": "social media", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "Mawqif is the first Arabic dataset that can be used for target-specific stance detection. This is a multi-label dataset where each data point is annotated for stance, sentiment, and sarcasm.", "Volume": "4,121", "Unit": "sentences", diff --git a/datasets/mcwc.json b/datasets/mcwc.json index e72b841..a655676 100644 --- a/datasets/mcwc.json +++ b/datasets/mcwc.json @@ -9,7 +9,7 @@ "Dialect": "Modern Standard Arabic", "Domain": "other", "Form": "text", - "Collection Style": "crawling and annotation(translation)", + "Collection Style": "crawling,annotation,machine translation", "Description": "The \u201cMultilingual Corpus of World\u2019s Constitutions\u201d (MCWC) is a rich resource available in English, Arabic, and Spanish, encompassing constitutions from various nations. 
This corpus serves as a vital asset for the NLP community, facilitating advanced research in constitutional analysis, machine translation, and cross-lingual legal studies.", "Volume": "236,156", "Unit": "documents", diff --git a/datasets/mediaspeech.json b/datasets/mediaspeech.json index ccd9ce8..0ac8190 100644 --- a/datasets/mediaspeech.json +++ b/datasets/mediaspeech.json @@ -9,7 +9,7 @@ "Dialect": "Modern Standard Arabic", "Domain": "transcribed audio", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "an open-source 10-hour ASR system evaluation\r\ndataset NTR MediaSpeech for 4 languages: Spanish, French,\r\nTurkish and Arabic", "Volume": "10", "Unit": "hours", diff --git a/datasets/mega-cov.json b/datasets/mega-cov.json index f5fad42..3141bc4 100644 --- a/datasets/mega-cov.json +++ b/datasets/mega-cov.json @@ -9,7 +9,7 @@ "Dialect": "Modern Standard Arabic", "Domain": "social media", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": " A Billion-Scale Dataset of 100+ Languages for COVID-19", "Volume": "45,000,000", "Unit": "sentences", diff --git a/datasets/mgb-2.json b/datasets/mgb-2.json index 47224cb..9e0d6fb 100644 --- a/datasets/mgb-2.json +++ b/datasets/mgb-2.json @@ -9,7 +9,7 @@ "Dialect": "Modern Standard Arabic", "Domain": "transcribed audio", "Form": "spoken", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": " from Aljazeera TV programs have been manually captioned with no timing information", "Volume": "1,200", "Unit": "hours", diff --git a/datasets/mgb-3.json b/datasets/mgb-3.json index 690760c..3547abb 100644 --- a/datasets/mgb-3.json +++ b/datasets/mgb-3.json @@ -9,7 +9,7 @@ "Dialect": "Egypt", "Domain": "transcribed audio", "Form": "spoken", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "explores multi-genre data; comedy, cooking, cultural, environment, family-kids, fashion, movies-drama, sports, and science talks (TEDX)", "Volume": "16", "Unit": "hours", diff --git a/datasets/mgb-5.json b/datasets/mgb-5.json index 4d1cb08..0902d37 100644 --- a/datasets/mgb-5.json +++ b/datasets/mgb-5.json @@ -9,7 +9,7 @@ "Dialect": "Morocco", "Domain": "transcribed audio", "Form": "spoken", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "Moroccan Arabic speech extracted from 93 YouTube videos distributed across seven genres: comedy, cooking, family/children, fashion, drama, sports, and science clips.", "Volume": "14", "Unit": "hours", diff --git a/datasets/mlma_hate_speech.json b/datasets/mlma_hate_speech.json index 573f804..5d734aa 100644 --- a/datasets/mlma_hate_speech.json +++ b/datasets/mlma_hate_speech.json @@ -9,7 +9,7 @@ "Dialect": "mixed", "Domain": "social media", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "Multilingual and Multi-Aspect Hate Speech Analysis", "Volume": "3,354", "Unit": "sentences", diff --git a/datasets/mlqa.json b/datasets/mlqa.json index 6470438..8d55dd0 100644 --- a/datasets/mlqa.json +++ b/datasets/mlqa.json @@ -9,7 +9,7 @@ "Dialect": "Modern Standard Arabic", "Domain": "other", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", 
"Description": "5K extractive QA instances (12K in English) in SQuAD format in seven languages - English, Arabic, German, Spanish, Hindi, Vietnamese and Simplified Chinese.", "Volume": "5,852", "Unit": "documents", diff --git a/datasets/mpold__multi_platforms_offensive_language_dataset.json b/datasets/mpold__multi_platforms_offensive_language_dataset.json index f2921d4..d40d381 100644 --- a/datasets/mpold__multi_platforms_offensive_language_dataset.json +++ b/datasets/mpold__multi_platforms_offensive_language_dataset.json @@ -9,7 +9,7 @@ "Dialect": "mixed", "Domain": "social media", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "Arabic Offensive Comments dataset from Multiple Social Media Platforms", "Volume": "400", "Unit": "documents", diff --git a/datasets/msac.json b/datasets/msac.json index 3d1fb8a..2cdf567 100644 --- a/datasets/msac.json +++ b/datasets/msac.json @@ -9,7 +9,7 @@ "Dialect": "Morocco", "Domain": "social media", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "rich and publicly available Arabic corpus called Moroccan Sentiment Analysis Corpus (MSAC)", "Volume": "2,000", "Unit": "sentences", diff --git a/datasets/msda.json b/datasets/msda.json index dde6519..41f7916 100644 --- a/datasets/msda.json +++ b/datasets/msda.json @@ -9,7 +9,7 @@ "Dialect": "mixed", "Domain": "social media", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "tweets anotated for sentiment analysis and topic detection", "Volume": "50,000", "Unit": "sentences", diff --git "a/datasets/multilingual_hate\r\nspeech_detection_dataset.json" "b/datasets/multilingual_hate\r\nspeech_detection_dataset.json" index 5b6ce92..9dcf0cd 100644 --- "a/datasets/multilingual_hate\r\nspeech_detection_dataset.json" +++ "b/datasets/multilingual_hate\r\nspeech_detection_dataset.json" @@ -9,7 +9,7 @@ "Dialect": "mixed", "Domain": "social media", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "Combined MLMA and L-HSAB datasets", "Volume": "5,790", "Unit": "sentences", diff --git a/datasets/nadi-2020.json b/datasets/nadi-2020.json index bb19663..e8de235 100644 --- a/datasets/nadi-2020.json +++ b/datasets/nadi-2020.json @@ -136,7 +136,7 @@ "Dialect": "mixed", "Domain": "social media", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "The data for the shared task covers a total of 100 provinces from 21 Arab countries and are collected from the Twitter domain", "Volume": "30,957", "Unit": "sentences", diff --git a/datasets/nadi-2021.json b/datasets/nadi-2021.json index 7bfea46..bd1095b 100644 --- a/datasets/nadi-2021.json +++ b/datasets/nadi-2021.json @@ -136,7 +136,7 @@ "Dialect": "mixed", "Domain": "social media", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "The shared task dataset covers a total of 100 provinces from 21 Arab countries, collected from the Twitter domain. 
", "Volume": "310,000", "Unit": "sentences", diff --git a/datasets/naim_mhedhbi_tunisian_dialect_corpus_v0.json b/datasets/naim_mhedhbi_tunisian_dialect_corpus_v0.json index 26422e3..371e8e8 100644 --- a/datasets/naim_mhedhbi_tunisian_dialect_corpus_v0.json +++ b/datasets/naim_mhedhbi_tunisian_dialect_corpus_v0.json @@ -9,7 +9,7 @@ "Dialect": "Tunisia", "Domain": "social media", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "I have collected more than 40000 comments and posts from facebook. I labeled about 10000 comments (positive / negative/ neutral) You can use it for your researches ! Enjoy :)", "Volume": "40,000", "Unit": "sentences", diff --git a/datasets/named_entities_lexicon.json b/datasets/named_entities_lexicon.json index d58d2a7..18d6619 100644 --- a/datasets/named_entities_lexicon.json +++ b/datasets/named_entities_lexicon.json @@ -9,7 +9,7 @@ "Dialect": "Modern Standard Arabic", "Domain": "other", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "Named entities (person,organisation and location) Arabic-English Pairs Person 27480 Organization 17237 Location 4036 Overall Arabic-English Pairs 48753", "Volume": "48,753", "Unit": "tokens", diff --git a/datasets/narabizi_corpus.json b/datasets/narabizi_corpus.json index ca3e6fe..76e8fe5 100644 --- a/datasets/narabizi_corpus.json +++ b/datasets/narabizi_corpus.json @@ -9,7 +9,7 @@ "Dialect": "Algeria", "Domain": "other", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "extension of NArabizi treebank by adding to annotations.", "Volume": "1,500", "Unit": "sentences", diff --git a/datasets/narabizi_treebank.json b/datasets/narabizi_treebank.json index 29a1bbb..695af1e 100644 --- a/datasets/narabizi_treebank.json +++ b/datasets/narabizi_treebank.json @@ -9,7 +9,7 @@ "Dialect": "Algeria", "Domain": "other", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": " fully annotated in morpho-syntax\r\nand Universal Dependency syntax, with full\r\ntranslation at both the word and the sentence\r\nlevels", "Volume": "1,500", "Unit": "sentences", diff --git a/datasets/ne3l__named_entities_arabic_corpus.json b/datasets/ne3l__named_entities_arabic_corpus.json index 88799d0..23b0ccf 100644 --- a/datasets/ne3l__named_entities_arabic_corpus.json +++ b/datasets/ne3l__named_entities_arabic_corpus.json @@ -9,7 +9,7 @@ "Dialect": "Modern Standard Arabic", "Domain": "news articles", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "The NE3L project (Named Entities 3 Languages) consisted in annotating several corpora with different languages with named entities. Text format data were extracted from newspapers and deal with various topics. 
3 different languages were annotated: Arabic, Chinese and Russian.", "Volume": "103,363", "Unit": "tokens", diff --git a/datasets/nileulex.json b/datasets/nileulex.json index c01bcb8..785dde6 100644 --- a/datasets/nileulex.json +++ b/datasets/nileulex.json @@ -9,7 +9,7 @@ "Dialect": "mixed", "Domain": "social media", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "Egyptian Arabic and Modern Standard Arabic sentiment words and their polarity", "Volume": "5,953", "Unit": "sentences", diff --git a/datasets/nlp_dataset_for_arabic_dialects.json b/datasets/nlp_dataset_for_arabic_dialects.json index 5054195..226f8f9 100644 --- a/datasets/nlp_dataset_for_arabic_dialects.json +++ b/datasets/nlp_dataset_for_arabic_dialects.json @@ -40,7 +40,7 @@ "Dialect": "mixed", "Domain": "social media", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "This data was collected from the Twitter social network and consists on +50K\ntwits in five (5) national dialects", "Volume": "52,210", "Unit": "sentences", diff --git a/datasets/oca__opinion_corpus_for_arabic.json b/datasets/oca__opinion_corpus_for_arabic.json index cae3bd0..7c3f527 100644 --- a/datasets/oca__opinion_corpus_for_arabic.json +++ b/datasets/oca__opinion_corpus_for_arabic.json @@ -9,7 +9,7 @@ "Dialect": "mixed", "Domain": "books", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "The corpus contains 500 movie reviews collected from different web pages and blogs in Arabic, 250 of them considered as positive reviews, and the other 250 as negative opinions", "Volume": "500", "Unit": "sentences", diff --git a/datasets/oclar.json b/datasets/oclar.json index cd7ad7e..93b1bc5 100644 --- a/datasets/oclar.json +++ b/datasets/oclar.json @@ -9,7 +9,7 @@ "Dialect": "Lebanon", "Domain": "reviews", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "Opinion Corpus for Lebanese Arabic Reviews ", "Volume": "3,916", "Unit": "sentences", diff --git a/datasets/offenseval_2020.json b/datasets/offenseval_2020.json index 0928b70..fe8ed22 100644 --- a/datasets/offenseval_2020.json +++ b/datasets/offenseval_2020.json @@ -9,7 +9,7 @@ "Dialect": "mixed", "Domain": "social media", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "The Arabic dataset consists of 10,000 tweets collected in April\u2013May 2019 using the Twitter API with the language filter set to Arabic: lang:ar.", "Volume": "10,000", "Unit": "sentences", diff --git a/datasets/omcca.json b/datasets/omcca.json index 1f29abf..e1c8992 100644 --- a/datasets/omcca.json +++ b/datasets/omcca.json @@ -22,7 +22,7 @@ "Dialect": "mixed", "Domain": "reviews", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "Opinion Mining Corpus for Colloquial Variety of Arabic language", "Volume": "28,576", "Unit": "sentences", diff --git a/datasets/ontonotes_5_0.json b/datasets/ontonotes_5_0.json index 5651bc3..e66b200 100644 --- a/datasets/ontonotes_5_0.json +++ b/datasets/ontonotes_5_0.json @@ -9,7 +9,7 @@ "Dialect": "Modern Standard Arabic", "Domain": "news articles", "Form": "text", - "Collection Style": "crawling and annotation(other)", + 
"Collection Style": "crawling,annotation", "Description": "The Arabic portion of OntoNotes 5.0 includes 300K words of newswire data. ", "Volume": "300,000", "Unit": "tokens", diff --git a/datasets/openiti-proc.json b/datasets/openiti-proc.json index 6ea339b..047904f 100644 --- a/datasets/openiti-proc.json +++ b/datasets/openiti-proc.json @@ -9,7 +9,7 @@ "Dialect": "Classical Arabic", "Domain": "other", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "A linguistically annotated version of the OpenITI corpus, with annotations for lemmas, POS tags, parse trees, and morphological segmentation", "Volume": "1,500,000,000", "Unit": "tokens", diff --git a/datasets/opus_ubuntu.json b/datasets/opus_ubuntu.json index 5a89fe8..0bef91e 100644 --- a/datasets/opus_ubuntu.json +++ b/datasets/opus_ubuntu.json @@ -9,7 +9,7 @@ "Dialect": "Classical Arabic", "Domain": "other", "Form": "text", - "Collection Style": "crawling and annotation(translation)", + "Collection Style": "crawling,annotation,machine translation", "Description": "These are translations of the Ubuntu software package messages, donated by the Ubuntu community.", "Volume": "299", "Unit": "documents", diff --git a/datasets/paad.json b/datasets/paad.json index 09e234b..6f465b6 100644 --- a/datasets/paad.json +++ b/datasets/paad.json @@ -9,7 +9,7 @@ "Dialect": "Modern Standard Arabic", "Domain": "news articles", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "he dataset is 206 articles distributed into three categories as (Reform, Conservative and Revolutionary) that we offer to the research community on Arabiccomputational linguistics.", "Volume": "206", "Unit": "documents", diff --git a/datasets/padic__parallel_arabic_dialect_corpus.json b/datasets/padic__parallel_arabic_dialect_corpus.json index c895c10..b8c80ab 100644 --- a/datasets/padic__parallel_arabic_dialect_corpus.json +++ b/datasets/padic__parallel_arabic_dialect_corpus.json @@ -46,7 +46,7 @@ "Dialect": "mixed", "Domain": "social media", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "s composed of about 6400 sentences of dialects from both the Maghreb and the middle east", "Volume": "12,824", "Unit": "sentences", diff --git a/datasets/pan17_author_profiling.json b/datasets/pan17_author_profiling.json index f22a2e0..bbbf433 100644 --- a/datasets/pan17_author_profiling.json +++ b/datasets/pan17_author_profiling.json @@ -9,7 +9,7 @@ "Dialect": "mixed", "Domain": "social media", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "We provide you with a training data set that consists of Twitter tweets in English, Spanish, Portuguese and Arabic, labeled with gender and language variety.", "Volume": "4,000", "Unit": "sentences", diff --git a/datasets/pan18_author_profiling.json b/datasets/pan18_author_profiling.json index 09e6924..4bd317f 100644 --- a/datasets/pan18_author_profiling.json +++ b/datasets/pan18_author_profiling.json @@ -9,7 +9,7 @@ "Dialect": "mixed", "Domain": "social media", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "We provide you with a training data set that consists of Twitter users labeled with gender. 
For each author, a total of 100 tweets and 10 images are provided. Authors are grouped by the language of their tweets: English, Arabic and Spanish.", "Volume": "250,000", "Unit": "sentences", diff --git a/datasets/pan_arabic_intrinsic_plagiarism_detection_shared_task_corpus.json b/datasets/pan_arabic_intrinsic_plagiarism_detection_shared_task_corpus.json index 34b1acf..50ba660 100644 --- a/datasets/pan_arabic_intrinsic_plagiarism_detection_shared_task_corpus.json +++ b/datasets/pan_arabic_intrinsic_plagiarism_detection_shared_task_corpus.json @@ -9,7 +9,7 @@ "Dialect": "Modern Standard Arabic", "Domain": "wikipedia", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "Each part of the corpus (training and test) consists mainly of 2 datasets: textual files and XML files. The textual files represent the suspicious documents i.e., the documents that contain artificial plagiarism; and the XML files are the plagiarism annotation i.e. they provide for each plagiarized passage its starting offset in the suspicious document and its length (offset and length are both expressed in characters). A suspicious document file and its plagiarism annotation file share the same name.", "Volume": "2,048", "Unit": "documents", diff --git a/datasets/polyglot-ner.json b/datasets/polyglot-ner.json index 2003d69..059d195 100644 --- a/datasets/polyglot-ner.json +++ b/datasets/polyglot-ner.json @@ -9,7 +9,7 @@ "Dialect": "Modern Standard Arabic", "Domain": "wikipedia", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "Polyglot-NER A training dataset automatically generated from Wikipedia and Freebase the task of named entity recognition. The dataset contains the basic Wikipedia based training data for 40 languages we have (with coreference resolution) for the task of named entity recognition. The details of the procedure of generating them is outlined in Section 3 of the paper (https://arxiv.org/abs/1410.3791). Each config contains the data corresponding to a different language. For example, \"es\" includes only spanish examples.", "Volume": "10,000,144", "Unit": "tokens", diff --git a/datasets/qa4mre.json b/datasets/qa4mre.json index 60986f9..e692be8 100644 --- a/datasets/qa4mre.json +++ b/datasets/qa4mre.json @@ -9,7 +9,7 @@ "Dialect": "Modern Standard Arabic", "Domain": "web pages", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "QA4MRE dataset was created for the CLEF 2011/2012/2013 shared tasks to promote research in question answering and reading comprehension. The dataset contains a supporting passage and a set of questions corresponding to the passage. Multiple options for answers are provided for each question, of which only one is correct. The training and test datasets are available for the main track. 
Additional gold standard documents are available for two pilot studies: one on alzheimers data, and the other on entrance exams data.", "Volume": "160", "Unit": "documents", diff --git a/datasets/qadi_arabic.json b/datasets/qadi_arabic.json index bddc40f..489d268 100644 --- a/datasets/qadi_arabic.json +++ b/datasets/qadi_arabic.json @@ -118,7 +118,7 @@ "Dialect": "mixed", "Domain": "social media", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "Dialects dataset", "Volume": "540,590", "Unit": "sentences", diff --git a/datasets/qasr.json b/datasets/qasr.json index 45e115e..e0a53e6 100644 --- a/datasets/qasr.json +++ b/datasets/qasr.json @@ -9,7 +9,7 @@ "Dialect": "mixed", "Domain": "transcribed audio", "Form": "spoken", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "This multi-dialect speech dataset contains 2, 000 hours of speech sampled at 16kHz crawled from Aljazeera news channel", "Volume": "2,000", "Unit": "hours", diff --git a/datasets/qatari_heritage_corpus.json b/datasets/qatari_heritage_corpus.json index edb3914..5762460 100644 --- a/datasets/qatari_heritage_corpus.json +++ b/datasets/qatari_heritage_corpus.json @@ -9,7 +9,7 @@ "Dialect": "Qatar", "Domain": "other", "Form": "text", - "Collection Style": "crawling and annotation(other)", + "Collection Style": "crawling,annotation", "Description": "qatari heritage expressions dataset with translations", "Volume": "1,000", "Unit": "sentences", diff --git a/datasets/quran_hadith_datasets.json b/datasets/quran_hadith_datasets.json index 973321e..8d1afe8 100644 --- a/datasets/quran_hadith_datasets.json +++ b/datasets/quran_hadith_datasets.json @@ -9,7 +9,7 @@ "Dialect": "Classical Arabic", "Domain": "other", "Form": "text", - "Collection Style": "crawling and annotation(translation)", + "Collection Style": "crawling,annotation,machine translation", "Description": "The datasets showcase the related and non-related pairs of Quran-Quran and Quran-Hadith. 
It has Classical Arabic and English translated verses and teachings.",
 "Volume": "20,360",
 "Unit": "sentences",
diff --git a/datasets/quran_speech__imam_+_users.json b/datasets/quran_speech__imam_+_users.json
index d14a191..6f4c487 100644
--- a/datasets/quran_speech__imam_+_users.json
+++ b/datasets/quran_speech__imam_+_users.json
@@ -9,7 +9,7 @@
 "Dialect": "Classical Arabic",
 "Domain": "transcribed audio",
 "Form": "spoken",
- "Collection Style": "crawling and annotation(other)",
+ "Collection Style": "crawling,annotation",
 "Description": "7 full Quran recitations + 18K filtered user recitation",
 "Volume": "61,000",
 "Unit": "sentences",
diff --git a/datasets/religious_hate_speech.json b/datasets/religious_hate_speech.json
index a598d57..92ea6e7 100644
--- a/datasets/religious_hate_speech.json
+++ b/datasets/religious_hate_speech.json
@@ -9,7 +9,7 @@
 "Dialect": "mixed",
 "Domain": "social media",
 "Form": "text",
- "Collection Style": "crawling and annotation(other)",
+ "Collection Style": "crawling,annotation",
 "Description": "training dataset contains 5,569 examples, while the testing dataset contains 567 examples collected from twittter",
 "Volume": "6,136",
 "Unit": "sentences",
diff --git a/datasets/rsac.json b/datasets/rsac.json
index c0bdde6..ea2bebe 100644
--- a/datasets/rsac.json
+++ b/datasets/rsac.json
@@ -9,7 +9,7 @@
 "Dialect": "mixed",
 "Domain": "reviews",
 "Form": "text",
- "Collection Style": "crawling and annotation(other)",
+ "Collection Style": "crawling,annotation",
 "Description": "This dataset contains 6318 hotel reviews collected from the Booking.com website. The reviews are written in both standard and dialectical Arabic and manually annotated as either positive or negative.",
 "Volume": "8,425",
 "Unit": "sentences",
diff --git a/datasets/sa`7r.json b/datasets/sa`7r.json
index bac2443..bbd4907 100644
--- a/datasets/sa`7r.json
+++ b/datasets/sa`7r.json
@@ -9,7 +9,7 @@
 "Dialect": "Saudi Arabia",
 "Domain": "social media",
 "Form": "text",
- "Collection Style": "crawling and annotation(other)",
+ "Collection Style": "crawling,annotation",
 "Description": "The dataset was collected using Twitter API and it consists of 19,810 tweets, 8,089 of them are labeled as ironic tweets.",
 "Volume": "19,810",
 "Unit": "sentences",
diff --git a/datasets/salma.json b/datasets/salma.json
index e9dbfd1..fcc3477 100644
--- a/datasets/salma.json
+++ b/datasets/salma.json
@@ -9,7 +9,7 @@
 "Dialect": "Modern Standard Arabic",
 "Domain": "web pages",
 "Form": "text",
- "Collection Style": "crawling and annotation(other)",
+ "Collection Style": "crawling,annotation",
 "Description": "SALMA corpus is part of the Wojood corpus (Jarrar et al., 2022), and was collected from 33 online media sources written in Modern Standard Arabic (MSA) and covering general topics.",
 "Volume": "34,000",
 "Unit": "tokens",
diff --git a/datasets/sanad.json b/datasets/sanad.json
index dd0570b..fcd5116 100644
--- a/datasets/sanad.json
+++ b/datasets/sanad.json
@@ -9,7 +9,7 @@
 "Dialect": "Modern Standard Arabic",
 "Domain": "news articles",
 "Form": "text",
- "Collection Style": "crawling and annotation(other)",
+ "Collection Style": "crawling,annotation",
 "Description": "textual data collected from three news portals",
 "Volume": "194,797",
 "Unit": "documents",
diff --git a/datasets/semeval-2017_task_4.json b/datasets/semeval-2017_task_4.json
index 6283d04..27728bc 100644
--- a/datasets/semeval-2017_task_4.json
+++ b/datasets/semeval-2017_task_4.json
@@ -9,7 +9,7 @@
 "Dialect": "Modern Standard Arabic",
 "Domain": "social media",
 "Form": "text",
- "Collection Style": "crawling and annotation(other)",
+ "Collection Style": "crawling,annotation",
 "Description": " a large accessible benchmark dataset containing over 70,000 tweets\nacross two languages",
 "Volume": "70,000",
 "Unit": "sentences",
diff --git a/datasets/semeval-2018_task_1.json b/datasets/semeval-2018_task_1.json
index db58a25..016fa91 100644
--- a/datasets/semeval-2018_task_1.json
+++ b/datasets/semeval-2018_task_1.json
@@ -9,7 +9,7 @@
 "Dialect": "Modern Standard Arabic",
 "Domain": "social media",
 "Form": "text",
- "Collection Style": "crawling and annotation(other)",
+ "Collection Style": "crawling,annotation",
 "Description": "it creates a new Affect in Tweets dataset of more than 22,000 tweets such\nthat subsets are annotated for a number of emotion dimensions.",
 "Volume": "22,000",
 "Unit": "tokens",
diff --git a/datasets/senti_lex.json b/datasets/senti_lex.json
index 6a80a1b..39454d8 100644
--- a/datasets/senti_lex.json
+++ b/datasets/senti_lex.json
@@ -9,7 +9,7 @@
 "Dialect": "Modern Standard Arabic",
 "Domain": "wikipedia",
 "Form": "text",
- "Collection Style": "crawling and annotation(other)",
+ "Collection Style": "crawling,annotation",
 "Description": "This dataset add sentiment lexicons for 81 languages generated via graph propagation based on a knowledge graph--a graphical representation of real-world entities and the links between them",
 "Volume": "2,794",
 "Unit": "tokens",
diff --git a/datasets/senwave.json b/datasets/senwave.json
index bcc9cb2..d517ec5 100644
--- a/datasets/senwave.json
+++ b/datasets/senwave.json
@@ -9,7 +9,7 @@
 "Dialect": "mixed",
 "Domain": "social media",
 "Form": "text",
- "Collection Style": "crawling and annotation(other)",
+ "Collection Style": "crawling,annotation",
 "Description": "The largest fine-grained annotated Covid-19 tweets dataset",
 "Volume": "10,000",
 "Unit": "sentences",
diff --git a/datasets/senzi.json b/datasets/senzi.json
index c77e7e1..830ae43 100644
--- a/datasets/senzi.json
+++ b/datasets/senzi.json
@@ -9,7 +9,7 @@
 "Dialect": "Lebanon",
 "Domain": "social media",
 "Form": "text",
- "Collection Style": "crawling and annotation(other)",
+ "Collection Style": "crawling,annotation",
 "Description": "by translating, annotating, and transliterating\r\nother resources to have an initial set of 2K sentiment words. We expanded it to 24.6K sentiment\r\nwords by importing inflectional and orthographic\r\nforms using word embeddings",
 "Volume": "24,600",
 "Unit": "tokens",
diff --git a/datasets/shami.json b/datasets/shami.json
index d002e47..072cc10 100644
--- a/datasets/shami.json
+++ b/datasets/shami.json
@@ -34,7 +34,7 @@
 "Dialect": "Levant",
 "Domain": "social media",
 "Form": "text",
- "Collection Style": "crawling and annotation(other)",
+ "Collection Style": "crawling,annotation",
 "Description": "the first Levantine Dialect Corpus (SDC) covering data from the four dialects spoken in Palestine, Jordan, Lebanon and Syria.",
 "Volume": "117,805",
 "Unit": "sentences",
diff --git a/datasets/sohateful.json b/datasets/sohateful.json
index b0b564a..43e7ce1 100644
--- a/datasets/sohateful.json
+++ b/datasets/sohateful.json
@@ -9,7 +9,7 @@
 "Dialect": "mixed",
 "Domain": "social media",
 "Form": "text",
- "Collection Style": "crawling and annotation(other)",
+ "Collection Style": "crawling,annotation",
 "Description": "70,000 Arabic tweets, from which 15,965 tweets were selected and annotated, to identify hate speech patterns and train classification models",
 "Volume": "15,965",
 "Unit": "sentences",
diff --git a/datasets/spiral.json b/datasets/spiral.json
index db8abee..5f3a66c 100644
--- a/datasets/spiral.json
+++ b/datasets/spiral.json
@@ -9,7 +9,7 @@
 "Dialect": "Modern Standard Arabic",
 "Domain": "news articles",
 "Form": "text",
- "Collection Style": "crawling and annotation(other)",
+ "Collection Style": "crawling,annotation",
 "Description": "SPIRAL is a corpus dedicated to the detection and correction of spelling errors in MSA Arabic texts.",
 "Volume": "248,441,892",
 "Unit": "tokens",
diff --git a/datasets/sudanese_dialect_tweets_about_ridesharing_companies.json b/datasets/sudanese_dialect_tweets_about_ridesharing_companies.json
index 9038b1a..07f9ed2 100644
--- a/datasets/sudanese_dialect_tweets_about_ridesharing_companies.json
+++ b/datasets/sudanese_dialect_tweets_about_ridesharing_companies.json
@@ -9,7 +9,7 @@
 "Dialect": "Sudan",
 "Domain": "social media",
 "Form": "text",
- "Collection Style": "crawling and annotation(other)",
+ "Collection Style": "crawling,annotation",
 "Description": "Sentiment Analysis dataset collected from Twitter. It contains people's opinions on Sudanese Ridesharing companies.",
 "Volume": "2,116",
 "Unit": "sentences",
diff --git a/datasets/sudanese_dialect_tweets_about_telecommunication_companies.json b/datasets/sudanese_dialect_tweets_about_telecommunication_companies.json
index d7b5071..613d288 100644
--- a/datasets/sudanese_dialect_tweets_about_telecommunication_companies.json
+++ b/datasets/sudanese_dialect_tweets_about_telecommunication_companies.json
@@ -9,7 +9,7 @@
 "Dialect": "Sudan",
 "Domain": "social media",
 "Form": "text",
- "Collection Style": "crawling and annotation(other)",
+ "Collection Style": "crawling,annotation",
 "Description": "Sentiment Analysis dataset written in Sudanese Arabic Dialect",
 "Volume": "4,712",
 "Unit": "sentences",
diff --git a/datasets/sudannese_arabic_telcom_sentiment_classification_pre_processed.json b/datasets/sudannese_arabic_telcom_sentiment_classification_pre_processed.json
index a1282f5..212ccb8 100644
--- a/datasets/sudannese_arabic_telcom_sentiment_classification_pre_processed.json
+++ b/datasets/sudannese_arabic_telcom_sentiment_classification_pre_processed.json
@@ -9,7 +9,7 @@
 "Dialect": "Sudan",
 "Domain": "other",
 "Form": "text",
- "Collection Style": "crawling and annotation(other)",
+ "Collection Style": "crawling,annotation",
 "Description": "it is pre processed dataset from Twitter about Telecom companies in Sudan, it labelled by 3 different labels from different age, gender and background",
 "Volume": "5,349",
 "Unit": "sentences",
diff --git a/datasets/syria_tweets_sentiment_corpus.json b/datasets/syria_tweets_sentiment_corpus.json
index 14ccbeb..bdd60f0 100644
--- a/datasets/syria_tweets_sentiment_corpus.json
+++ b/datasets/syria_tweets_sentiment_corpus.json
@@ -9,7 +9,7 @@
 "Dialect": "Syria",
 "Domain": "social media",
 "Form": "text",
- "Collection Style": "crawling and annotation(other)",
+ "Collection Style": "crawling,annotation",
 "Description": "A dataset of 2000 tweets originating from Syria",
 "Volume": "2,000",
 "Unit": "sentences",
diff --git a/datasets/tarc.json b/datasets/tarc.json
index 9729b1d..4621719 100644
--- a/datasets/tarc.json
+++ b/datasets/tarc.json
@@ -9,7 +9,7 @@
 "Dialect": "mixed",
 "Domain": "social media",
 "Form": "text",
- "Collection Style": "crawling and annotation(other)",
+ "Collection Style": "crawling,annotation",
 "Description": "flexible and multi-purpose open corpus in order to be a useful support for different types of analyses: computational and linguistics, as well as for NLP tools training",
 "Volume": "4,790",
 "Unit": "sentences",
diff --git a/datasets/tead.json b/datasets/tead.json
index a4db495..9510505 100644
--- a/datasets/tead.json
+++ b/datasets/tead.json
@@ -9,7 +9,7 @@
 "Dialect": "mixed",
 "Domain": "social media",
 "Form": "text",
- "Collection Style": "crawling and annotation(other)",
+ "Collection Style": "crawling,annotation",
 "Description": "dataset for Arabic Sentiment Analysis",
 "Volume": "6,000,000",
 "Unit": "sentences",
diff --git a/datasets/ted_talks_corpus_(wit3).json b/datasets/ted_talks_corpus_(wit3).json
index a0c56bf..f616ea0 100644
--- a/datasets/ted_talks_corpus_(wit3).json
+++ b/datasets/ted_talks_corpus_(wit3).json
@@ -9,7 +9,7 @@
 "Dialect": "mixed",
 "Domain": "transcribed audio",
 "Form": "text",
- "Collection Style": "crawling and annotation(translation)",
+ "Collection Style": "crawling,annotation,machine translation",
 "Description": " The TED Talks corpus contains transcribed and translated TED Talks from 82+ languages. It is curated for use in machine translation and natural language processing research, providing a valuable multilingual parallel corpus.",
 "Volume": "17,000",
 "Unit": "sentences",
diff --git a/datasets/the_nine_books_of_arabic_hadith.json b/datasets/the_nine_books_of_arabic_hadith.json
index b7c9cdf..2012ed9 100644
--- a/datasets/the_nine_books_of_arabic_hadith.json
+++ b/datasets/the_nine_books_of_arabic_hadith.json
@@ -9,7 +9,7 @@
 "Dialect": "Classical Arabic",
 "Domain": "other",
 "Form": "text",
- "Collection Style": "crawling and annotation(other)",
+ "Collection Style": "crawling,annotation",
 "Description": "There are two files of Hadith, the first one for all hadith With Tashkil and Without Tashkel from the Nine Books that are 62,169 Hadith. The second one it Hadith pre-processing data, which is applyed normalization and removeing stop words and lemmatization on it\n\n",
 "Volume": "62,169",
 "Unit": "documents",
diff --git a/datasets/toxi-text-3m.json b/datasets/toxi-text-3m.json
index 3019af6..d7f8974 100644
--- a/datasets/toxi-text-3m.json
+++ b/datasets/toxi-text-3m.json
@@ -9,7 +9,7 @@
 "Dialect": "mixed",
 "Domain": "social media",
 "Form": "text",
- "Collection Style": "crawling and annotation(other)",
+ "Collection Style": "crawling,annotation",
 "Description": "This is a large multilingual toxicity dataset with 3M rows of text data from 55 natural languages, all of which are written/sent by humans, not machine translation models.",
 "Volume": "51,852",
 "Unit": "sentences",
diff --git a/datasets/trad_arabic-english_mailing_lists_parallel_corpus_-_development_set.json b/datasets/trad_arabic-english_mailing_lists_parallel_corpus_-_development_set.json
index a828222..2a7d591 100644
--- a/datasets/trad_arabic-english_mailing_lists_parallel_corpus_-_development_set.json
+++ b/datasets/trad_arabic-english_mailing_lists_parallel_corpus_-_development_set.json
@@ -9,7 +9,7 @@
 "Dialect": "Modern Standard Arabic",
 "Domain": "wikipedia",
 "Form": "text",
- "Collection Style": "crawling and annotation(translation)",
+ "Collection Style": "crawling,annotation,machine translation",
 "Description": "This is a parallel corpus of 10,000 words in Arabic and a reference translation in English. The source texts are emails collected from Wikiar-I, a mailing list for discussions about the Arabic Wikipedia. The collected emails are dated from 2004 to 2007. The translation has been conducted following a strict protocol aimed at producing high quality translations.",
 "Volume": "10,000",
 "Unit": "tokens",
diff --git a/datasets/trad_arabic-english_mailing_lists_parallel_corpus_-_test_set.json b/datasets/trad_arabic-english_mailing_lists_parallel_corpus_-_test_set.json
index 0948504..697f451 100644
--- a/datasets/trad_arabic-english_mailing_lists_parallel_corpus_-_test_set.json
+++ b/datasets/trad_arabic-english_mailing_lists_parallel_corpus_-_test_set.json
@@ -9,7 +9,7 @@
 "Dialect": "Modern Standard Arabic",
 "Domain": "wikipedia",
 "Form": "text",
- "Collection Style": "crawling and annotation(translation)",
+ "Collection Style": "crawling,annotation,machine translation",
 "Description": "This is a parallel corpus of 10,000 words in Arabic and 2 reference translations in English. The source texts are emails collected from Wikiar-I, a mailing list for discussions about the Arabic Wikipedia. The collected emails are dated from 2010 to 2012. The translation has been conducted by two different translation teams following a strict protocol aimed at producing high quality translations.",
 "Volume": "10,000",
 "Unit": "tokens",
diff --git a/datasets/trad_arabic-english_newspaper_parallel_corpus_-_test_set_1.json b/datasets/trad_arabic-english_newspaper_parallel_corpus_-_test_set_1.json
index e78b09a..02feaf8 100644
--- a/datasets/trad_arabic-english_newspaper_parallel_corpus_-_test_set_1.json
+++ b/datasets/trad_arabic-english_newspaper_parallel_corpus_-_test_set_1.json
@@ -9,7 +9,7 @@
 "Dialect": "Modern Standard Arabic",
 "Domain": "other",
 "Form": "text",
- "Collection Style": "crawling and annotation(translation)",
+ "Collection Style": "crawling,annotation,machine translation",
 "Description": "This is a parallel corpus of 10,000 words in Arabic and 2 reference translations in English. The source texts are articles collected in 2012 from the Arabic version of Le Monde Diplomatique. The translation has been conducted by two different translation teams following a strict protocol aimed at producing high quality translations.",
 "Volume": "10,000",
 "Unit": "tokens",
diff --git a/datasets/troll_detection.json b/datasets/troll_detection.json
index b1504de..14d986e 100644
--- a/datasets/troll_detection.json
+++ b/datasets/troll_detection.json
@@ -9,7 +9,7 @@
 "Dialect": "mixed",
 "Domain": "social media",
 "Form": "text",
- "Collection Style": "crawling and annotation(other)",
+ "Collection Style": "crawling,annotation",
 "Description": "Trolls detection in Tweets",
 "Volume": "128",
 "Unit": "sentences",
diff --git a/datasets/tsac.json b/datasets/tsac.json
index 71f6e45..26ed225 100644
--- a/datasets/tsac.json
+++ b/datasets/tsac.json
@@ -9,7 +9,7 @@
 "Dialect": "Tunisia",
 "Domain": "social media",
 "Form": "text",
- "Collection Style": "crawling and annotation(other)",
+ "Collection Style": "crawling,annotation",
 "Description": "About 17k user comments manually annotated to positive and negative polarities. This corpus is collected from Facebook users comments written on official pages of Tunisian radios and TV channels",
 "Volume": "17,000",
 "Unit": "sentences",
diff --git a/datasets/tufs_media.json b/datasets/tufs_media.json
index d467893..8894d16 100644
--- a/datasets/tufs_media.json
+++ b/datasets/tufs_media.json
@@ -9,7 +9,7 @@
 "Dialect": "Modern Standard Arabic",
 "Domain": "news articles",
 "Form": "text",
- "Collection Style": "crawling and annotation(translation)",
+ "Collection Style": "crawling,annotation,machine translation",
 "Description": "a parallel corpus of translated news articles collected at Tokyo University of Foreign Studies (TUFS)",
 "Volume": "8,652",
 "Unit": "sentences",
diff --git a/datasets/tunizi.json b/datasets/tunizi.json
index b8ad34e..469d9b8 100644
--- a/datasets/tunizi.json
+++ b/datasets/tunizi.json
@@ -9,7 +9,7 @@
 "Dialect": "mixed",
 "Domain": "social media",
 "Form": "text",
- "Collection Style": "crawling and annotation(other)",
+ "Collection Style": "crawling,annotation",
 "Description": "first Tunisian Arabizi Dataset including 3K sentences, balanced, covering different topics, preprocessed and annotated as positive and negative",
 "Volume": "3,000",
 "Unit": "sentences",
diff --git a/datasets/twifil.json b/datasets/twifil.json
index 1e54b0f..4cb7bae 100644
--- a/datasets/twifil.json
+++ b/datasets/twifil.json
@@ -9,7 +9,7 @@
 "Dialect": "Algeria",
 "Domain": "social media",
 "Form": "text",
- "Collection Style": "crawling and annotation(other)",
+ "Collection Style": "crawling,annotation",
 "Description": "An Algerian dialect dataset annotated for both sentiment (9,000 tweets), emotion (about 5,000 tweets) and extra-linguistic information including author profiling (age and gender)",
 "Volume": "14,000",
 "Unit": "sentences",
diff --git a/datasets/twt15da_lists.json b/datasets/twt15da_lists.json
index 009a183..7f7e529 100644
--- a/datasets/twt15da_lists.json
+++ b/datasets/twt15da_lists.json
@@ -100,7 +100,7 @@
 "Dialect": "mixed",
 "Domain": "social media",
 "Form": "text",
- "Collection Style": "crawling and annotation(translation)",
+ "Collection Style": "crawling,annotation,machine translation",
 "Description": "The annotated dialectal Arabic corpus (Twt15DA) is collected from Twitter and consists of 311,785 tweets containing 3,858,459 words in total. They randomly selected a sample of 75 tweets per country, 1125 tweets in total, and conducted a manual dialect identification task by native speakers.",
 "Volume": "311,785",
 "Unit": "sentences",
diff --git a/datasets/tydiqa.json b/datasets/tydiqa.json
index 47afdf0..4bc78c3 100644
--- a/datasets/tydiqa.json
+++ b/datasets/tydiqa.json
@@ -9,7 +9,7 @@
 "Dialect": "Modern Standard Arabic",
 "Domain": "other",
 "Form": "text",
- "Collection Style": "crawling and annotation(other)",
+ "Collection Style": "crawling,annotation",
 "Description": "question answering dataset covering 11 typologically diverse languages with 200K question-answer pairs",
 "Volume": "25,893",
 "Unit": "sentences",
diff --git a/datasets/understanding_and_detecting_dangerous_speech_in_social_media.json b/datasets/understanding_and_detecting_dangerous_speech_in_social_media.json
index 0b6b9a6..98befb0 100644
--- a/datasets/understanding_and_detecting_dangerous_speech_in_social_media.json
+++ b/datasets/understanding_and_detecting_dangerous_speech_in_social_media.json
@@ -9,7 +9,7 @@
 "Dialect": "mixed",
 "Domain": "social media",
 "Form": "text",
- "Collection Style": "crawling and annotation(other)",
+ "Collection Style": "crawling,annotation",
 "Description": "Dangerous speech detection",
 "Volume": "5,000",
 "Unit": "sentences",
diff --git a/datasets/wikiann.json b/datasets/wikiann.json
index 50531ff..6bf7196 100644
--- a/datasets/wikiann.json
+++ b/datasets/wikiann.json
@@ -9,7 +9,7 @@
 "Dialect": "Modern Standard Arabic",
 "Domain": "wikipedia",
 "Form": "text",
- "Collection Style": "crawling and annotation(other)",
+ "Collection Style": "crawling,annotation",
 "Description": " Both name tagging and linking results for 282 languages are promising on Wikipedia data and on-Wikipedia\r\ndata",
 "Volume": "185,000",
 "Unit": "tokens",
diff --git a/datasets/wikiann_ner(mmner).json b/datasets/wikiann_ner(mmner).json
index 83873a1..36fcf7b 100644
--- a/datasets/wikiann_ner(mmner).json
+++ b/datasets/wikiann_ner(mmner).json
@@ -9,7 +9,7 @@
 "Dialect": "Modern Standard Arabic",
 "Domain": "wikipedia",
 "Form": "text",
- "Collection Style": "crawling and annotation(other)",
+ "Collection Style": "crawling,annotation",
 "Description": "Cross-lingual name tagging and linking for 282 languages",
 "Volume": "30,000",
 "Unit": "tokens",
diff --git a/datasets/wojood.json b/datasets/wojood.json
index 3257b19..a70a6fd 100644
--- a/datasets/wojood.json
+++ b/datasets/wojood.json
@@ -9,7 +9,7 @@
 "Dialect": "Modern Standard Arabic",
 "Domain": "news articles",
 "Form": "text",
- "Collection Style": "crawling and annotation(other)",
+ "Collection Style": "crawling,annotation",
 "Description": "Wojood consists of about 550K tokens (MSA and dialect) that are manually annotated with 21 entity types (e.g., person, organization, location, event, date, etc). It covers multiple domains and was annotated with nested entities. The corpus contains about 75K entities and 22.5% of which are nested.",
 "Volume": "550,464",
 "Unit": "tokens",
diff --git a/datasets/wojoodfine.json b/datasets/wojoodfine.json
index 0dc5ff7..abc4200 100644
--- a/datasets/wojoodfine.json
+++ b/datasets/wojoodfine.json
@@ -9,7 +9,7 @@
 "Dialect": "mixed",
 "Domain": "news articles",
 "Form": "text",
- "Collection Style": "crawling and annotation(other)",
+ "Collection Style": "crawling,annotation",
 "Description": "WojoodFine is an extension of Wojood and consists of about 550K tokens (MSA and dialect) that are manually annotated with 21 entity types and four main entity types in Wojood (GPE, LOC, ORG, and FAC) are annotated with 31 new fine-grained subtypes. It covers multiple domains and was annotated with nested entities. The corpus contains about 75K entities and 22.5% of which are nested. A nested named entity recognition (NER) model based on BERT was trained (F1-score 92.29.4%).",
 "Volume": "550,000",
 "Unit": "tokens",
diff --git a/datasets/wojoodgaza.json b/datasets/wojoodgaza.json
index 33874da..e6dcf52 100644
--- a/datasets/wojoodgaza.json
+++ b/datasets/wojoodgaza.json
@@ -9,7 +9,7 @@
 "Dialect": "mixed",
 "Domain": "news articles",
 "Form": "text",
- "Collection Style": "crawling and annotation(other)",
+ "Collection Style": "crawling,annotation",
 "Description": "It is an extestion of Wojood. It is about the ongoing Israeli War on Gaza, based on the assumption that discourse about recent global events will involve mentions from different data distributions. The dataset is collected from five news domains related to the War (Health, Economics, Finance, Politics, and Law). It consists of 60k tokens, divided into a test set (50k) and a development set (10k), with the domains evenly distributed.y. It is manually annotated with fine-grained named entities, following the same annotation guidelines as WojoodFine.",
 "Volume": "60,000",
 "Unit": "tokens",
diff --git a/datasets/wojoodhadath.json b/datasets/wojoodhadath.json
index 5d4b989..21a09dc 100644
--- a/datasets/wojoodhadath.json
+++ b/datasets/wojoodhadath.json
@@ -9,7 +9,7 @@
 "Dialect": "mixed",
 "Domain": "news articles",
 "Form": "text",
- "Collection Style": "crawling and annotation(other)",
+ "Collection Style": "crawling,annotation",
 "Description": "Extends the Wojood dataset by incorporating relations into Wojood's nested structure. The added relations include hasAgent, hasLocation, and hasDate. The dataset, provided in JSON format, consists of sentences, each containing one or more events along with their corresponding arguments. It is divided into three subsets: training, validation, and test.",
 "Volume": "550,000",
 "Unit": "tokens",
diff --git a/datasets/wsd.json b/datasets/wsd.json
index 17e77fc..e4a5284 100644
--- a/datasets/wsd.json
+++ b/datasets/wsd.json
@@ -9,7 +9,7 @@
 "Dialect": "Modern Standard Arabic",
 "Domain": "other",
 "Form": "text",
- "Collection Style": "crawling and annotation(other)",
+ "Collection Style": "crawling,annotation",
 "Description": "A dataset for Arabic Word Sense Disambiguation (WSD) consisting of 3670 labeled examples of 100 polysemous Arabic words. It provides multiple senses for each word, annotated with real-world and GPT-generated sentences.",
 "Volume": "3,670",
 "Unit": "sentences",