From dd11107a28a393241cebf39dd46b2d4dada827ed Mon Sep 17 00:00:00 2001
From: zaidalyafeai
Date: Fri, 20 Dec 2024 19:55:17 +0300
Subject: [PATCH] add individual jsons instead of full data

---
 .../101_billion_arabic_words_dataset.json | 36 ++++
 ...993-2007_united_nations_parallel_text.json | 36 ++++
 datasets/1997_hub5_arabic_evaluation.json | 36 ++++
 datasets/1997_hub5_arabic_transcripts.json | 36 ++++
 ..._nist_language_recognition_evaluation.json | 36 ++++
 ...st_rich_transcription_evaluation_data.json | 36 ++++
 ...aker_recognition_evaluation_test_data.json | 36 ++++
 ..._recognition_evaluation_training_data.json | 36 ++++
 ...06_conll_shared_task_-_arabic_&_czech.json | 36 ++++
 ...ecognition_evaluation_test_set_part_1.json | 36 ++++
 ...ecognition_evaluation_test_set_part_2.json | 36 ++++
 ...r_recognition_evaluation_training_set.json | 36 ++++
 ...spoken_term_detection_development_set.json | 36 ++++
 ..._spoken_term_detection_evaluation_set.json | 36 ++++
 ..._conll_shared_task_-_arabic_&_english.json | 36 ++++
 ..._evaluation_supplemental_training_set.json | 36 ++++
 ...guage_recognition_evaluation_test_set.json | 36 ++++
 ...ion_(metricsmatr)_gale_evaluation_set.json | 36 ++++
 ...tion_(metricsmatr08)_development_data.json | 36 ++++
 ...eaker_recognition_evaluation_test_set.json | 36 ++++
 ...nition_evaluation_training_set_part_1.json | 36 ++++
 ...nition_evaluation_training_set_part_2.json | 36 ++++
 ...guage_recognition_evaluation_test_set.json | 36 ++++
 ...eaker_recognition_evaluation_test_set.json | 36 ++++
 datasets/a-speechdb.json | 36 ++++
 datasets/a7'ta.json | 36 ++++
 ...20th_centuries)_for_stylometric_tests.json | 36 ++++
 datasets/absa-hotels.json | 36 ++++
 ...ace_2004_multilingual_training_corpus.json | 36 ++++
 ...ace_2005_multilingual_training_corpus.json | 36 ++++
 ...ace_2007_multilingual_training_corpus.json | 36 ++++
 datasets/acqad.json | 36 ++++
 datasets/adcc.json | 36 ++++
 datasets/adi-17.json | 139 +++++++++++++++
 datasets/adi-5.json | 67 +++++++
 datasets/adpbc.json | 36 ++++
 ...bic_twitter__analysis_and_experiments.json | 36 ++++
 datasets/afewc.json | 36 ++++
 datasets/afnd.json | 36 ++++
 .../afrd__arabic_fake_reviews_detection.json | 36 ++++
 datasets/aghlat.json | 36 ++++
 datasets/ags.json | 36 ++++
 datasets/ai_society_translated.json | 36 ++++
 datasets/ajdir_corpora.json | 36 ++++
 datasets/ajgt.json | 36 ++++
 datasets/akec.json | 36 ++++
 datasets/al-hayat_arabic_corpus.json | 36 ++++
 datasets/alc__arabic_learner_corpus.json | 36 ++++
 datasets/aljazeera-dialectal_speech.json | 36 ++++
 datasets/aljazeera_deleted_comments.json | 36 ++++
 .../alr__arabic_laptop_reviews_dataset.json | 36 ++++
 .../alriyadh-newspaper-covid-dataset.json | 36 ++++
 datasets/alue.json | 36 ++++
 datasets/amara.json | 36 ++++
 datasets/an-nahar_newspaper_text_corpus.json | 36 ++++
 .../anad__arabic_natural_audio_dataset.json | 36 ++++
 datasets/anercorp.json | 36 ++++
 datasets/anetac.json | 36 ++++
 datasets/annotated_shami_corpus.json | 36 ++++
 ...corpus_in_arabizi,_french_and_english.json | 36 ++++
 datasets/ans_corpus___claim_verification.json | 43 +++++
 datasets/antcorpus.json | 36 ++++
 ...ial_behaviour_in_online_communication.json | 36 ++++
 datasets/aoc-aldi.json | 36 ++++
 datasets/aoc.json | 49 ++++++
 datasets/apcd.json | 36 ++++
 datasets/apcd2.json | 36 ++++
 ...0__arabic_parallel_gender_corpus_v1_0.json | 36 ++++
 ...0__arabic_parallel_gender_corpus_v2_0.json | 36 ++++
 .../aqad__arabic_question-answer_dataset.json | 36 ++++
 datasets/aqmar.json | 36 ++++
 datasets/aqqac.json | 36 ++++
 datasets/ar-asag.json | 36 ++++
 ...ord_embeddings_for_sentiment_analysis.json | 36 ++++
 datasets/ara-timebank.json | 36 ++++
 datasets/arab-acquis.json | 36 ++++
 datasets/arab-andalusian_music_corpus.json | 36 ++++
 datasets/arab-esl.json | 36 ++++
 .../arab_states_analogy_dataset_(asad).json | 36 ++++
 datasets/arabceleb.json | 36 ++++
 datasets/arabench.json | 36 ++++
 datasets/arabglossbert.json | 36 ++++
 datasets/arabic-dataset-for-capt.json | 36 ++++
 .../arabic-dialect_english_parallel_text.json | 43 +++++
 ...arabic-english_named_entities_dataset.json | 36 ++++
 ...abic-hebrew_ted_talks_parallel_corpus.json | 36 ++++
 ...bic-multi-classification-dataset-amcd.json | 36 ++++
 datasets/arabic-news.json | 36 ++++
 datasets/arabic-ocr.json | 36 ++++
 datasets/arabic-openhermes-2_5.json | 36 ++++
 datasets/arabic-poem-emotion.json | 36 ++++
 datasets/arabic-stories-corpus.json | 36 ++++
 ..._egyptian_comparable_wikipedia_corpus.json | 36 ++++
 datasets/arabic_100k_reviews.json | 36 ++++
 datasets/arabic_ala_lc__romanization.json | 36 ++++
 datasets/arabic_analogy.json | 36 ++++
 datasets/arabic_billion_words.json | 36 ++++
 datasets/arabic_broadcast_news_speech.json | 36 ++++
 .../arabic_broadcast_news_transcripts.json | 36 ++++
 datasets/arabic_business_corpora.json | 36 ++++
 datasets/arabic_common_voice.json | 36 ++++
 ...ne_fisher_training_data_set_3,_speech.json | 36 ++++
 ...sher_training_data_set_3,_transcripts.json | 36 ++++
 datasets/arabic_dev2.json | 36 ++++
 datasets/arabic_dialects_dataset.json | 67 +++++++
 .../arabic_dictionary_of_inflected_words.json | 36 ++++
 ...rabic_document_classification_dataset.json | 36 ++++
 datasets/arabic_empathetic_dialogues.json | 36 ++++
 .../arabic_english_parallel_news_part_1.json | 36 ++++
 datasets/arabic_flood_twitter_dataset.json | 36 ++++
 datasets/arabic_gigaword.json | 36 ++++
 datasets/arabic_gigaword_fifth_edition.json | 36 ++++
 datasets/arabic_gigaword_fourth_edition.json | 36 ++++
 datasets/arabic_gigaword_second_edition.json | 36 ++++
 datasets/arabic_gigaword_third_edition.json | 36 ++++
 .../arabic_hate_speech_2022_shared_task.json | 36 ++++
 .../arabic_infectious_disease_ontology.json | 36 ++++
 datasets/arabic_keyphrase_dataset.json | 36 ++++
 datasets/arabic_morphological_dictionary.json | 36 ++++
 datasets/arabic_named_entities.json | 36 ++++
 datasets/arabic_named_entity_gazetteer.json | 36 ++++
 datasets/arabic_natural_audio_dataset.json | 36 ++++
 datasets/arabic_news_articles.json | 36 ++++
 ...abic_news_articles_from_aljazeera_net.json | 36 ++++
 datasets/arabic_news_dataset_about_hajj.json | 36 ++++
 .../arabic_news_translation_text_part_1.json | 36 ++++
 datasets/arabic_news_tweets.json | 36 ++++
 ...wswire_english_translation_collection.json | 36 ++++
 datasets/arabic_newswire_part_1.json | 36 ++++
 datasets/arabic_ontology.json | 36 ++++
 ...osact4___offensive_language_detection.json | 36 ++++
 .../arabic_osact5___arabic_hate_speech.json | 36 ++++
 datasets/arabic_oscar.json | 36 ++++
 datasets/arabic_pos_dialect.json | 61 +++++++
 datasets/arabic_punctuation_dataset.json | 36 ++++
 datasets/arabic_rc_datasets.json | 36 ++++
 datasets/arabic_satire_dataset.json | 36 ++++
 .../arabic_satirical_fake_news_dataset.json | 36 ++++
 datasets/arabic_senti-lexicon.json | 36 ++++
 datasets/arabic_sentiment_lexicons.json | 36 ++++
 datasets/arabic_sentiment_twitter_corpus.json | 36 ++++
 datasets/arabic_sms_chat.json | 36 ++++
 datasets/arabic_spam_and_ham_tweets.json | 36 ++++
 datasets/arabic_speech_commands_dataset.json | 36 ++++
 datasets/arabic_speech_corpus.json | 36 ++++
 ..._recognition_pronunciation_dictionary.json | 36 ++++
 datasets/arabic_stop_words.json | 36 ++++
 datasets/arabic_text_diacritization.json | 36 ++++
 .../arabic_textual_entailment_dataset.json | 36 ++++
 ...arabic_treebank_-_broadcast_news_v1_0.json | 36 ++++
 datasets/arabic_treebank_-_weblog.json | 36 ++++
 ...part_1_-_10k-word_english_translation.json | 36 ++++
 datasets/arabic_treebank__part_1_v_2_0.json | 36 ++++
 ...ll_vocalization_+_syntactic_analysis).json | 36 ++++
 datasets/arabic_treebank__part_1_v_4_1.json | 36 ++++
 datasets/arabic_treebank__part_2_v_2_0.json | 36 ++++
 datasets/arabic_treebank__part_2_v_3_1.json | 36 ++++
 datasets/arabic_treebank__part_3.json | 36 ++++
 datasets/arabic_treebank__part_3_v_1_0.json | 36 ++++
 datasets/arabic_treebank__part_3_v_3_2.json | 36 ++++
 ...eebank__part_4_v_1_0_(mpg_annotation).json | 36 ++++
 ...abic_tweets_about_infectious_diseases.json | 36 ++++
 datasets/arabic_wiki_data_dump_2018.json | 36 ++++
 datasets/arabic_wikipedia_20230101_bots.json | 36 ++++
 .../arabic_wikipedia_20230101_nobots.json | 36 ++++
 datasets/arabic_wikipedia_talk_pages.json | 36 ++++
 .../arabic_wikireading_and_kaiflematha.json | 36 ++++
 datasets/arabicaqa.json | 36 ++++
 datasets/arabichatespeechdataset.json | 36 ++++
 datasets/arabicmmlu.json | 36 ++++
 datasets/arabicqa_2_1m.json | 36 ++++
 datasets/arabicsa.json | 36 ++++
 datasets/arabicsenamticsimilaritydataset.json | 36 ++++
 datasets/arabicweb16.json | 36 ++++
 datasets/arabicweb24.json | 36 ++++
 ...se_of_arabic_general_vocabulary_(dag).json | 36 ++++
 datasets/arabscribe.json | 36 ++++
 datasets/aracovid19-mfh.json | 36 ++++
 ...ntiment_and_sarcasm_detection_dataset.json | 36 ++++
 datasets/aracust.json | 36 ++++
 datasets/aradata.json | 79 +++++++++
 datasets/arafacts.json | 36 ++++
 datasets/aranews.json | 36 ++++
 datasets/aranpcc.json | 36 ++++
 datasets/arap-tweet_corpus.json | 36 ++++
 datasets/arasencorpus.json | 36 ++++
 datasets/arasenti.json | 36 ++++
 datasets/araspider.json | 36 ++++
 datasets/arastance.json | 36 ++++
 datasets/arasum_corpus.json | 36 ++++
 datasets/arbanking77.json | 36 ++++
 datasets/arc-wmi.json | 36 ++++
 datasets/arcd.json | 36 ++++
 ...ys_of_coronavirus_(covid-19)_pandemic.json | 36 ++++
 datasets/arcov-19.json | 36 ++++
 datasets/arcov19-rumors.json | 36 ++++
 datasets/arcovidvac.json | 36 ++++
 datasets/areej.json | 36 ++++
 datasets/arentail.json | 36 ++++
 datasets/arl_arabic_dependency_treebank.json | 36 ++++
 datasets/armath.json | 36 ++++
 .../armi__arabic_misogynistic_dataset.json | 36 ++++
 datasets/arparallel.json | 36 ++++
 ...aphrase_identification_in_arabic_text.json | 36 ++++
 datasets/arpod.json | 73 ++++++++
 ...stion_identification_in_arabic_tweets.json | 36 ++++
 datasets/arquad.json | 36 ++++
 datasets/arsarcasm-v2.json | 67 +++++++
 datasets/arsarcasm.json | 67 +++++++
 datasets/arsarcasmoji.json | 36 ++++
 datasets/arsas.json | 36 ++++
 datasets/arsen-20.json | 36 ++++
 datasets/arsenl.json | 36 ++++
 datasets/arsentd-lev.json | 36 ++++
 datasets/arsentiment.json | 36 ++++
 datasets/arsl21l.json | 36 ++++
 datasets/artest.json | 36 ++++
 datasets/artrivia.json | 36 ++++
 datasets/arvox.json | 36 ++++
 datasets/arwiki.json | 36 ++++
 datasets/arzen-multigenre.json | 36 ++++
 datasets/asad.json | 36 ++++
 datasets/asayar.json | 36 ++++
 datasets/ashaar.json | 36 ++++
 datasets/askfm.json | 36 ++++
 datasets/astad.json | 36 ++++
 datasets/astd.json | 36 ++++
 datasets/at-odtsa.json | 36 ++++
 datasets/atar.json | 36 ++++
 datasets/athar.json | 36 ++++
 datasets/attimam.json | 36 ++++
 datasets/author_attribution_tweets.json | 36 ++++
 datasets/autotweet.json | 36 ++++
 datasets/aya_dataset.json | 36 ++++
 datasets/ayatec.json | 36 ++++
 datasets/baec.json | 55 ++++++
 datasets/baladi_lebanese_dialect_corpora.json | 36 ++++
 datasets/baved.json | 36 ++++
 ...vantine_arabic_speech_and_transcripts.json | 36 ++++
 datasets/bbn_blog_posts_sentiment_corpus.json | 36 ++++
 datasets/belebele.json | 73 ++++++++
 datasets/bible_para.json | 36 ++++
 datasets/bnl_historical_newspapers.json | 36 ++++
 ...scussion_forum_parallel_training_data.json | 36 ++++
 datasets/bolt_arabic_discussion_forums.json | 36 ++++
 ...lignment_--_discussion_forum_training.json | 36 ++++
 ...versational_telephone_speech_training.json | 36 ++++
 ...h_word_alignment_--_sms_chat_training.json | 36 ++++
 ...,_and_conversational_telephone_speech.json | 36 ++++
 ...,_and_conversational_telephone_speech.json | 36 ++++
 ...rabic_sms_chat_parallel_training_data.json | 36 ++++
 ...ank_-_conversational_telephone_speech.json | 36 ++++
 ...an_arabic_treebank_-_discussion_forum.json | 36 ++++
 ...t_egyptian_arabic_treebank_-_sms_chat.json | 36 ++++
 ...comprehensive_training_and_evaluation.json | 36 ++++
 datasets/botta.json | 36 ++++
 datasets/brad_1_0.json | 36 ++++
 datasets/brad_2_0.json | 36 ++++
 ...ic_morphological_analyzer_version_1_0.json | 36 ++++
 ...ic_morphological_analyzer_version_2_0.json | 36 ++++
 datasets/calima-glf.json | 36 ++++
 datasets/callfriend_egyptian_arabic.json | 36 ++++
 ...friend_egyptian_arabic_second_edition.json | 36 ++++
 ...tian_arabic_speech_translation_corpus.json | 36 ++++
 datasets/callhome_egyptian_arabic_speech.json | 36 ++++
 ...ome_egyptian_arabic_speech_supplement.json | 36 ++++
 .../callhome_egyptian_arabic_transcripts.json | 36 ++++
 ...gyptian_arabic_transcripts_supplement.json | 36 ++++
 datasets/calliar.json | 36 ++++
 datasets/calyou.json | 36 ++++
 datasets/cameltb__camel_treebank_1_0.json | 36 ++++
 datasets/canercorpus.json | 36 ++++
 datasets/cc-100.json | 36 ++++
 datasets/ccaligned.json | 36 ++++
 datasets/ccmatrix.json | 36 ++++
 datasets/ceap.json | 36 ++++
 datasets/checkthat-ar.json | 36 ++++
 ...childes_egyptian_arabic_salama_corpus.json | 36 ++++
 .../ciad__corpus_of_iraqi_arabic_dialect.json | 36 ++++
 datasets/cidar.json | 36 ++++
 ...lassical_arabic_text-to-speech_corpus.json | 36 ++++
 datasets/classical_arabic_dictionary.json | 36 ++++
 datasets/cleananercorp.json | 36 ++++
 ...nal_orthography_for_dialectal_arabic).json | 36 ++++
 datasets/commonlanguage.json | 36 ++++
 datasets/commonsense_validation.json | 36 ++++
 datasets/comparable_wikipedia_coprus.json | 49 ++++++
 ..._chinese,_japanese,_korean_and_arabic.json | 36 ++++
 ...corpus_for_moroccan_arabic_processing.json | 36 ++++
 datasets/coronavirus.json | 36 ++++
 ...n_arabic_and_gulf_arabic_from_twitter.json | 49 ++++++
 ...orpus_of_offensive_language_in_arabic.json | 36 ++++
 datasets/covid-19-arabic-tweets-dataset.json | 36 ++++
 ...vid-19_disinformation_twitter_dataset.json | 36 ++++
 datasets/covid-fakes.json | 36 ++++
 datasets/covost_2.json | 36 ++++
 datasets/cqa-md__semeval-2016_task_3.json | 36 ++++
 datasets/cross-lingual_ner.json | 36 ++++
 datasets/crosssum.json | 36 ++++
 datasets/cslu__22_languages_corpus.json | 36 ++++
 .../ctab__corpus_of_tunisian_arabizi.json | 36 ++++
 datasets/culturax.json | 36 ++++
 datasets/curras.json | 36 ++++
 datasets/daict.json | 36 ++++
 datasets/dares.json | 36 ++++
 datasets/dart.json | 67 +++++++
 datasets/database_of_arab_names.json | 36 ++++
 datasets/database_of_arabic_plurals.json | 36 ++++
 .../database_of_foreign_names_in_arabic.json | 36 ++++
 .../dataset_for_arabic_classification.json | 36 ++++
 ...ataset_for_evaluating_root_extraction.json | 36 ++++
 ..._arabic_why_question_answering_system.json | 36 ++++
 datasets/defarabicqa.json | 36 ++++
 ...ikipedia_template-translated_articles.json | 36 ++++
 ...alectal_arabic_code-switching_dataset.json | 36 ++++
 datasets/dialex.json | 67 +++++++
 datasets/disease_ner.json | 36 ++++
 ...ctionary_french_arabic,_arabic_french.json | 36 ++++
 datasets/doda__darija_open_dataset.json | 36 ++++
 datasets/dzdc12.json | 36 ++++
 datasets/dziribert.json | 36 ++++
 datasets/easc.json | 36 ++++
 datasets/edgad.json | 36 ++++
 .../egyptian_arabic_wikipedia_20230101.json | 36 ++++
 .../egyptian_colloquial_arabic_lexicon.json | 36 ++++
 datasets/elecmorocco2016.json | 36 ++++
 datasets/emoji-sentiment-dataset.json | 36 ++++
 datasets/emotional-tone.json | 36 ++++
 datasets/english-arabic_treebank_v_1_0.json | 36 ++++
 datasets/everyayah.json | 36 ++++
 datasets/evetar.json | 36 ++++
 datasets/exams.json | 36 ++++
 ...ational_telephone_speech,_transcripts.json | 36 ++++
 ...rabic_conversational_telephone_speech.json | 36 ++++
 datasets/flodusta.json | 36 ++++
 datasets/flores-101.json | 36 ++++
 ...ned_treebank_--_broadcast_news_part_1.json | 36 ++++
 ...ned_treebank_--_broadcast_news_part_2.json | 36 ++++
 ...parallel_aligned_treebank_--_newswire.json | 36 ++++
 ...llel_aligned_treebank_--_web_training.json | 36 ++++
 ...lignment_--_broadcast_training_part_1.json | 36 ++++
 ...lignment_--_broadcast_training_part_2.json | 36 ++++
 ...t_training_part_1_--_newswire_and_web.json | 36 ++++
 ...alignment_training_part_2_--_newswire.json | 36 ++++
 ...word_alignment_training_part_3_--_web.json | 36 ++++
 ...ale_phase_1_arabic_blog_parallel_text.json | 36 ++++
 ...broadcast_news_parallel_text_-_part_1.json | 36 ++++
 ...broadcast_news_parallel_text_-_part_2.json | 36 ++++
 ...abic_newsgroup_parallel_text_-_part_1.json | 36 ++++
 ...abic_newsgroup_parallel_text_-_part_2.json | 36 ++++
 .../gale_phase_1_distillation_training.json | 36 ++++
 ...ast_conversation_parallel_text_part_1.json | 36 ++++
 ...ast_conversation_parallel_text_part_2.json | 36 ++++
 ..._broadcast_conversation_speech_part_1.json | 36 ++++
 ..._broadcast_conversation_speech_part_2.json | 36 ++++
 ...dcast_conversation_transcripts_part_1.json | 36 ++++
 ...dcast_conversation_transcripts_part_2.json | 36 ++++
 ...2_arabic_broadcast_news_parallel_text.json | 36 ++++
 ...2_arabic_broadcast_news_speech_part_1.json | 36 ++++
 ...2_arabic_broadcast_news_speech_part_2.json | 36 ++++
 ...bic_broadcast_news_transcripts_part_1.json | 36 ++++
 ...bic_broadcast_news_transcripts_part_2.json | 36 ++++
 ...phase_2_arabic_newswire_parallel_text.json | 36 ++++
 ...gale_phase_2_arabic_web_parallel_text.json | 36 ++++
 ..._broadcast_conversation_parallel_text.json | 36 ++++
 ...4_arabic_broadcast_news_parallel_text.json | 36 ++++
 ...3_and_4_arabic_newswire_parallel_text.json | 36 ++++
 ...hase_3_and_4_arabic_web_parallel_text.json | 36 ++++
 ..._broadcast_conversation_speech_part_1.json | 36 ++++
 ..._broadcast_conversation_speech_part_2.json | 36 ++++
 ...dcast_conversation_transcripts_part_1.json | 36 ++++
 ...dcast_conversation_transcripts_part_2.json | 36 ++++
 ...3_arabic_broadcast_news_speech_part_1.json | 36 ++++
 ...3_arabic_broadcast_news_speech_part_2.json | 36 ++++
 ...bic_broadcast_news_transcripts_part_1.json | 36 ++++
 ...bic_broadcast_news_transcripts_part_2.json | 36 ++++
 ...dcast_conversation_parallel_sentences.json | 36 ++++
 ..._arabic_broadcast_conversation_speech.json | 36 ++++
 ...ic_broadcast_conversation_transcripts.json | 36 ++++
 ...bic_broadcast_news_parallel_sentences.json | 36 ++++
 ..._phase_4_arabic_broadcast_news_speech.json | 36 ++++
 ...e_4_arabic_broadcast_news_transcripts.json | 36 ++++
 ..._4_arabic_newswire_parallel_sentences.json | 36 ++++
 ...se_4_arabic_weblog_parallel_sentences.json | 36 ++++
 datasets/gem.json | 36 ++++
 datasets/gem_-_wikilingua.json | 36 ++++
 datasets/gem_-_xlsum.json | 36 ++++
 datasets/geowac.json | 133 ++++++++++++++
 datasets/glare.json | 36 ++++
 datasets/gnome.json | 36 ++++
 datasets/goud-sum.json | 36 ++++
 ...ational_telephone_speech,_transcripts.json | 36 ++++
 ...rabic_conversational_telephone_speech.json | 36 ++++
 datasets/gumar.json | 85 +++++++++
 datasets/haad.json | 36 ++++
 datasets/habibi.json | 73 ++++++++
 datasets/hard.json | 36 ++++
 datasets/hc_corpora.json | 36 ++++
 datasets/hijja.json | 36 ++++
 ...rks_of_selected_openmt08_09_sentences.json | 36 ++++
 datasets/idat.json | 36 ++++
 datasets/idrisi-r.json | 61 +++++++
 datasets/inaracorpus.json | 36 ++++
 datasets/infopankki_v1.json | 36 ++++
 datasets/international_corpus_of_arabic.json | 36 ++++
 ...ational_telephone_speech,_transcripts.json | 36 ++++
 ...rabic_conversational_telephone_speech.json | 36 ++++
 .../isarcasmeval__semeval-2022_task_6.json | 36 ++++
 ...automatically_extracted_parallel_text.json | 36 ++++
 ...dialogues_corpus_for_egyptian_dialect.json | 36 ++++
 datasets/journalists_questions.json | 36 ++++
 datasets/kacst.json | 36 ++++
 datasets/kalamdz.json | 36 ++++
 datasets/kalimat.json | 36 ++++
 datasets/kawarith.json | 36 ++++
 datasets/kde4.json | 36 ++++
 datasets/khaleej-2004.json | 36 ++++
 datasets/khalidalt_tydiqa-goldp.json | 36 ++++
 datasets/khawas.json | 36 ++++
 datasets/kind.json | 36 ++++
 datasets/ksaa-rd_dataset.json | 36 ++++
 datasets/ksu_rich_arabic_speech_database.json | 36 ++++
 datasets/ksucca_corpus.json | 36 ++++
 datasets/ksuemotions.json | 36 ++++
 datasets/kunuz.json | 36 ++++
 datasets/l-hsab.json | 36 ++++
 datasets/labr.json | 36 ++++
 datasets/lama.json | 36 ++++
 datasets/language_identification.json | 36 ++++
 ...guage_understanding_annotation_corpus.json | 36 ++++
 ...sources_for_arabic_sentiment_analysis.json | 36 ++++
 datasets/laser.json | 36 ++++
 ...tar__standard_arabic_phonetic_lexicon.json | 36 ++++
 ...hological_analyzer_(sama)_version_3_1.json | 36 ++++
 ...de_diplomatique__arabic_tagged_corpus.json | 36 ++++
 datasets/lebanon_uprising_arabic_tweets.json | 36 ++++
 datasets/let-mi.json | 36 ++++
 ...ational_telephone_speech,_transcripts.json | 36 ++++
 ...rabic_conversational_telephone_speech.json | 36 ++++
 ...ing_data_set_4_(speech_+_transcripts).json | 36 ++++
 ...arabic_qt_training_data_set_5,_speech.json | 36 ++++
 ...c_qt_training_data_set_5,_transcripts.json | 36 ++++
 ...nce_-_msa-da__(lid_-_code_switching_).json | 36 ++++
 ...ince_-_msa-egy_(ner_-_code_switching).json | 36 ++++
 datasets/lisan.json | 61 +++++++
 datasets/lk-hadith-corpus.json | 36 ++++
 datasets/madar.json | 36 ++++
 datasets/madar_lexicon.json | 36 ++++
 datasets/madar_twitter_corpus.json | 163 ++++++++++++++++++
 datasets/madcat_phase_1_training_set.json | 36 ++++
 datasets/madcat_phase_2_training_set.json | 36 ++++
 datasets/madcat_phase_3_training_set.json | 36 ++++
 datasets/maknuune.json | 36 ++++
 ...abic_resources_for_sentiment_analysis.json | 36 ++++
 ...rsum__moroccan_articles_summarisation.json | 36 ++++
 datasets/masc.json | 36 ++++
 .../masc__massive_arabic_speech_corpus.json | 36 ++++
 .../masked_arab_states_dataset_(masd).json | 36 ++++
 datasets/mawqif.json | 36 ++++
 datasets/mc4.json | 36 ++++
 datasets/mcwc.json | 36 ++++
 datasets/mediaspeech.json | 36 ++++
 datasets/medical_corpus.json | 36 ++++
 datasets/mega-cov.json | 36 ++++
 ...erged_arabic_corpus_of_isolated_words.json | 36 ++++
 datasets/metrec.json | 36 ++++
 datasets/mfqa.json | 36 ++++
 datasets/mgb-2.json | 36 ++++
 datasets/mgb-3.json | 36 ++++
 datasets/mgb-5.json | 36 ++++
 .../microsoft_terminology_collection.json | 36 ++++
 datasets/mimic-it.json | 36 ++++
 datasets/miracl.json | 36 ++++
 datasets/mkqa.json | 36 ++++
 datasets/ml_spoken_words.json | 36 ++++
 datasets/mldr.json | 36 ++++
 datasets/mlma_hate_speech.json | 36 ++++
 datasets/mlqa.json | 36 ++++
 datasets/mmac.json | 36 ++++
 datasets/mmedc.json | 36 ++++
 datasets/moarlex.json | 36 ++++
 ...roccan_arabic_wikipedia_20230101_bots.json | 36 ++++
 ...ccan_arabic_wikipedia_20230101_nobots.json | 36 ++++
 ...zilla_foundation_common_voice_dataset.json | 36 ++++
 ..._platforms_offensive_language_dataset.json | 36 ++++
 datasets/mr__tydi.json | 36 ++++
 datasets/msac.json | 36 ++++
 datasets/msda.json | 36 ++++
 datasets/mtvqa.json | 36 ++++
 ...telephone_speech_2011_--_arabic_group.json | 55 ++++++
 ...sh-french-arabic_trilingual_database.json" | 36 ++++
 ...ual_hate\r\nspeech_detection_dataset.json" | 36 ++++
 datasets/multilingual_lama.json | 36 ++++
 datasets/multilingual_reward_bench.json | 36 ++++
 datasets/multilingual_tts.json | 36 ++++
 ...tiple-translation_arabic_(mta)_part_1.json | 36 ++++
 ...tiple-translation_arabic_(mta)_part_2.json | 36 ++++
 datasets/multitacred.json | 36 ++++
 datasets/multiun_v2.json | 36 ++++
 datasets/munazarat_1_0.json | 36 ++++
 datasets/nabra.json | 36 ++++
 datasets/nada.json | 36 ++++
 datasets/nadi-2020.json | 163 ++++++++++++++++++
 datasets/nadi-2021.json | 163 ++++++++++++++++++
 datasets/nadia.json | 36 ++++
 ...ic_fragments_for_inestimable_stemming.json | 36 ++++
 ...im_mhedhbi_tunisian_dialect_corpus_v0.json | 36 ++++
 datasets/named_entities_lexicon.json | 36 ++++
 datasets/names_transliteration.json | 36 ++++
 datasets/narabizi_corpus.json | 36 ++++
 datasets/narabizi_treebank.json | 36 ++++
 .../ne3l__named_entities_arabic_corpus.json | 36 ++++
 ...d_speculation_in_arabic_review_(nsar).json | 36 ++++
 .../nemlar__broadcast_news_speech_corpus.json | 36 ++++
 datasets/nemlar__speech_synthesis_corpus.json | 36 ++++
 datasets/nemlar__written_corpus.json | 36 ++++
 datasets/nemlar_written_corpus.json | 36 ++++
 ...ic_bnsc__broadcast_news_speech_corpus.json | 36 ++++
 datasets/netransliteration.json | 36 ++++
 datasets/news_commentary.json | 36 ++++
 datasets/newstent.json | 36 ++++
 datasets/nileulex.json | 36 ++++
 ...chine_translation_(openmt)_evaluation.json | 36 ++++
 ...chine_translation_(openmt)_evaluation.json | 36 ++++
 ...chine_translation_(openmt)_evaluation.json | 36 ++++
 ...chine_translation_(openmt)_evaluation.json | 36 ++++
 ...chine_translation_(openmt)_evaluation.json | 36 ++++
 ...anslation_(openmt)_progress_test_sets.json | 36 ++++
 ...chine_translation_(openmt)_evaluation.json | 36 ++++
 ...chine_translation_(openmt)_evaluation.json | 36 ++++
 ...t)_progress_test_five_language_source.json | 36 ++++
 ...ed_references_and_system_translations.json | 36 ++++
 datasets/nlp_dataset_for_arabic_dialects.json | 67 +++++++
 datasets/nsurl-2019_shared_task_8.json | 36 ++++
 datasets/oasst2.json | 36 ++++
 datasets/oca__opinion_corpus_for_arabic.json | 36 ++++
 datasets/oclar.json | 36 ++++
 datasets/offenseval_2020.json | 36 ++++
 datasets/omcca.json | 49 ++++++
 datasets/ontonotes_5_0.json | 36 ++++
 datasets/ontonotes_release_3_0.json | 36 ++++
 datasets/ontonotes_release_4_0.json | 36 ++++
 datasets/ontonotes_release_5.json | 36 ++++
 ...esponse_generation_in_arabic_dialects.json | 36 ++++
 datasets/openiti-proc.json | 36 ++++
 datasets/opensubtitles.json | 36 ++++
 datasets/opus100.json | 36 ++++
 datasets/opus_ubuntu.json | 36 ++++
 datasets/opus_wikipedia.json | 36 ++++
 ...ca__modern_colloquial_arabic_database.json | 36 ++++
 ...msa_(modern_standard_arabic)_database.json | 36 ++++
 ...msa_(modern_standard_arabic)_database.json | 36 ++++
 ...msa_(modern_standard_arabic)_database.json | 36 ++++
 datasets/osac.json | 36 ++++
 datasets/oscar-2201.json | 36 ++++
 datasets/oscar_small.json | 36 ++++
 datasets/osian.json | 36 ++++
 datasets/osman.json | 36 ++++
 datasets/osman_un_corpus.json | 36 ++++
 datasets/paad.json | 36 ++++
 datasets/padic.json | 73 ++++++++
 ...padic__parallel_arabic_dialect_corpus.json | 73 ++++++++
 datasets/pan17_author_profiling.json | 36 ++++
 datasets/pan18_author_profiling.json | 36 ++++
 ...agiarism_detection_shared_task_corpus.json | 36 ++++
 datasets/papluca_language-identification.json | 36 ++++
 datasets/phonbank_arabic_kuwaiti_corpus.json | 36 ++++
 datasets/phonemes_of_arabic.json | 36 ++++
 ...__collins_multilingual_database_(mld).json | 36 ++++
 datasets/polyglot-ner.json | 36 ++++
 ...prague_arabic_dependency_treebank_1_0.json | 36 ++++
 datasets/ptcc.json | 36 ++++
 datasets/pulpo.json | 36 ++++
 datasets/qa4mre.json | 36 ++++
 datasets/qabas.json | 36 ++++
 datasets/qac__qatari_arabic_corpus.json | 36 ++++
 datasets/qadi_arabic.json | 145 ++++++++++++++++
 datasets/qasr.json | 36 ++++
 datasets/qatari_heritage_corpus.json | 36 ++++
 datasets/qcri_parallel_tweets.json | 36 ++++
 datasets/quran_hadith_datasets.json | 36 ++++
 datasets/quran_speech__imam_+_users.json | 36 ++++
 datasets/quranic_arabic_corpus.json | 36 ++++
 datasets/rats_keyword_spotting.json | 36 ++++
 datasets/rats_language_identification.json | 36 ++++
 datasets/rats_speaker_identification.json | 36 ++++
 datasets/rats_speech_activity_detection.json | 36 ++++
 ...x_entity_translation_training_devtest.json | 36 ++++
 datasets/religious_hate_speech.json | 36 ++++
 datasets/rewayatech.json | 36 ++++
 datasets/rsac.json | 36 ++++
 datasets/sa`7r.json | 36 ++++
 datasets/sad.json | 36 ++++
 datasets/salma.json | 36 ++++
 datasets/samer_readability_lexicon.json | 36 ++++
 datasets/sanad.json | 36 ++++
 ...nadset_650k__data_on_hadith_narrators.json | 36 ++++
 datasets/saudinewsnet.json | 36 ++++
 datasets/semeval-2017_task_4.json | 36 ++++
 datasets/semeval-2018_task_1.json | 36 ++++
 datasets/semeval-2021_task_2.json | 36 ++++
 datasets/senti_lex.json | 36 ++++
 .../sentiment_lexicons_for_81_languages.json | 36 ++++
 datasets/senwave.json | 36 ++++
 datasets/senzi.json | 36 ++++
 ...etrieval_from_arabic_knowledge_graphs.json | 36 ++++
 datasets/shakkelha.json | 36 ++++
 datasets/shamela.json | 36 ++++
 datasets/shamela_diacritics_corpus.json | 36 ++++
 datasets/shamela_et_al_arabic_corpus.json | 36 ++++
 datasets/shami.json | 61 +++++++
 datasets/snad.json | 36 ++++
 datasets/sohateful.json | 36 ++++
 datasets/speech-massive.json | 36 ++++
 datasets/spiral.json | 36 ++++
 datasets/stopword_lists_for_19_languages.json | 36 ++++
 datasets/student_university_corpus.json | 36 ++++
 ...ct_tweets_about_ridesharing_companies.json | 36 ++++
 ...ets_about_telecommunication_companies.json | 36 ++++
 ...entiment_classification_pre_processed.json | 36 ++++
 datasets/synonyms.json | 36 ++++
 datasets/syria_tweets_sentiment_corpus.json | 36 ++++
 datasets/talaa.json | 36 ++++
 datasets/tanzil.json | 36 ++++
 datasets/tapaco.json | 36 ++++
 datasets/tarc.json | 36 ++++
 datasets/tashkeela.json | 55 ++++++
 datasets/tatoeba.json | 36 ++++
 datasets/tatoeba_translation_challenge.json | 36 ++++
 ...ilingual_broadcast_news_speech_corpus.json | 36 ++++
 ...dt4_multilingual_text_and_annotations.json | 36 ++++
 datasets/tdt5_multilingual_text.json | 36 ++++
 datasets/tdt5_topics_and_annotations.json | 36 ++++
 datasets/tead.json | 36 ++++
 datasets/ted_talks_corpus_(wit3).json | 36 ++++
 datasets/ted_talks_iwslt.json | 36 ++++
 datasets/the_arabic_e-book_corpus.json | 36 ++++
 ...abic_speech_corpus_for_isolated_words.json | 36 ++++
 datasets/the_nine_books_of_arabic_hadith.json | 36 ++++
 datasets/the_sadid_evaluation_datasets.json | 61 +++++++
 ...(ace)_2003_multilingual_training_data.json | 36 ++++
 datasets/toxi-text-3m.json | 36 ++++
 ...sts_parallel_corpus_-_development_set.json | 36 ++++
 ...ling_lists_parallel_corpus_-_test_set.json | 36 ++++
 ...ewspaper_parallel_corpus_-_test_set_1.json | 36 ++++
 ..._of_transcribed_broadcast_news_speech.json | 36 ++++
 ...sh_web_domain_(blogs)_parallel_corpus.json | 36 ++++
 ...sts_parallel_corpus_-_development_set.json | 36 ++++
 ...ling_lists_parallel_corpus_-_test_set.json | 36 ++++
 ...bic-french_parallel_text_--_newsgroup.json | 36 ++++
 ...abic-french_parallel_text_--_newswire.json | 36 ++++
 datasets/transliteration.json | 36 ++++
 .../trecvid_2005_keyframes_&_transcripts.json | 36 ++++
 datasets/trecvid_2006_keyframes.json | 36 ++++
 datasets/troll_detection.json | 36 ++++
 datasets/tsac.json | 36 ++++
 datasets/tudicoi.json | 36 ++++
 datasets/tufs_media.json | 36 ++++
 datasets/tunisian_arabic_corpus.json | 36 ++++
 datasets/tunizi.json | 36 ++++
 datasets/twifil.json | 36 ++++
 datasets/twt15da_lists.json | 127 ++++++++++++++
 datasets/tydiqa.json | 36 ++++
 datasets/udp_(udp-nyuad).json | 36 ++++
 ...arallel_corpus_of_north_levantine_1_0.json | 36 ++++
 datasets/ultimate_arabic_news_dataset.json | 36 ++++
 datasets/un_multi.json | 36 ++++
 ...ting_dangerous_speech_in_social_media.json | 36 ++++
 ...linguistic_annotation_text_collection.json | 36 ++++
 datasets/unimorph.json | 36 ++++
 ..._nations_general_assembly_resolutions.json | 36 ++++
 datasets/united_nations_parallel_corpus.json | 36 ++++
 .../united_nations_proceedings_speech.json | 36 ++++
 datasets/universal_dependencies.json | 55 ++++++
 datasets/watan-2004.json | 36 ++++
 datasets/waw.json | 36 ++++
 datasets/wdc.json | 36 ++++
 datasets/west_point_arabic_speech.json | 36 ++++
 datasets/wikiann.json | 36 ++++
 datasets/wikiann_ner(mmner).json | 36 ++++
 datasets/wikidocsaligner_dataset.json | 49 ++++++
 datasets/wikimatrix.json | 36 ++++
 datasets/wikimedia_wit_base.json | 36 ++++
 datasets/wikipedia.json | 36 ++++
 datasets/wikiqaar.json | 36 ++++
 datasets/wili-2018.json | 36 ++++
 datasets/winomt_(mt_gender).json | 36 ++++
 datasets/wojood.json | 36 ++++
 datasets/wojoodfine.json | 36 ++++
 datasets/wojoodgaza.json | 36 ++++
 datasets/wojoodhadath.json | 36 ++++
 datasets/wsd.json | 36 ++++
 datasets/x-csr.json | 36 ++++
 datasets/xcsr.json | 49 ++++++
 datasets/xglue.json | 36 ++++
 datasets/xl-headtags.json | 36 ++++
 datasets/xlel_wd.json | 36 ++++
 datasets/xlel_wd_dictionary.json | 36 ++++
 datasets/xnli.json | 36 ++++
 datasets/xor-tydi_qa.json | 36 ++++
 datasets/xp3all.json | 36 ++++
 datasets/xquad.json | 36 ++++
 datasets/xquad_r.json | 36 ++++
 .../xsid_-_(x)_slot_and_intent_detection.json | 36 ++++
 datasets/xtreme.json | 36 ++++
 datasets/zaebuc.json | 36 ++++
 706 files changed, 26984 insertions(+)
 create mode 100644 datasets/101_billion_arabic_words_dataset.json
 create mode 100644 datasets/1993-2007_united_nations_parallel_text.json
 create mode 100644 datasets/1997_hub5_arabic_evaluation.json
 create mode 100644 datasets/1997_hub5_arabic_transcripts.json
 create mode 100644 datasets/2003_nist_language_recognition_evaluation.json
 create mode 100644 datasets/2003_nist_rich_transcription_evaluation_data.json
 create mode 100644 datasets/2005_nist_speaker_recognition_evaluation_test_data.json
 create mode 100644 datasets/2005_nist_speaker_recognition_evaluation_training_data.json
 create mode 100644 datasets/2006_conll_shared_task_-_arabic_&_czech.json
 create mode 100644 datasets/2006_nist_speaker_recognition_evaluation_test_set_part_1.json
 create mode 100644 datasets/2006_nist_speaker_recognition_evaluation_test_set_part_2.json
 create mode 100644 datasets/2006_nist_speaker_recognition_evaluation_training_set.json
 create mode 100644 datasets/2006_nist_spoken_term_detection_development_set.json
 create mode 100644 datasets/2006_nist_spoken_term_detection_evaluation_set.json
 create mode 100644 datasets/2007_conll_shared_task_-_arabic_&_english.json
 create mode 100644 datasets/2007_nist_language_recognition_evaluation_supplemental_training_set.json
 create mode 100644 datasets/2007_nist_language_recognition_evaluation_test_set.json
 create mode 100644 datasets/2008_2010_nist_metrics_for_machine_translation_(metricsmatr)_gale_evaluation_set.json
 create mode 100644 datasets/2008_nist_metrics_for_machine_translation_(metricsmatr08)_development_data.json
 create mode 100644 datasets/2008_nist_speaker_recognition_evaluation_test_set.json
 create mode 100644 datasets/2008_nist_speaker_recognition_evaluation_training_set_part_1.json
 create mode 100644 datasets/2008_nist_speaker_recognition_evaluation_training_set_part_2.json
 create mode 100644 datasets/2011_nist_language_recognition_evaluation_test_set.json
 create mode 100644 datasets/2018_nist_speaker_recognition_evaluation_test_set.json
 create mode 100644 datasets/a-speechdb.json
 create mode 100644 datasets/a7'ta.json
 create mode 100644 datasets/a_corpus_of_arabic_literature_(19-20th_centuries)_for_stylometric_tests.json
 create mode 100644 datasets/absa-hotels.json
 create mode 100644 datasets/ace_2004_multilingual_training_corpus.json
 create mode 100644 datasets/ace_2005_multilingual_training_corpus.json
 create mode 100644 datasets/ace_2007_multilingual_training_corpus.json
 create mode 100644 datasets/acqad.json
 create mode 100644 datasets/adcc.json
 create mode 100644 datasets/adi-17.json
 create mode 100644 datasets/adi-5.json
 create mode 100644 datasets/adpbc.json
 create mode 100644 datasets/adult_content_detection_on_arabic_twitter__analysis_and_experiments.json
 create mode 100644 datasets/afewc.json
 create mode 100644 datasets/afnd.json
 create mode 100644 datasets/afrd__arabic_fake_reviews_detection.json
 create mode 100644 datasets/aghlat.json
 create mode 100644 datasets/ags.json
 create mode 100644 datasets/ai_society_translated.json
 create mode 100644 datasets/ajdir_corpora.json
 create mode 100644 datasets/ajgt.json
 create mode 100644 datasets/akec.json
 create mode 100644 datasets/al-hayat_arabic_corpus.json
 create mode 100644 datasets/alc__arabic_learner_corpus.json
 create mode 100644 datasets/aljazeera-dialectal_speech.json
 create mode 100644 datasets/aljazeera_deleted_comments.json
 create mode 100644 datasets/alr__arabic_laptop_reviews_dataset.json
 create mode 100644 datasets/alriyadh-newspaper-covid-dataset.json
 create mode 100644 datasets/alue.json
 create mode 100644 datasets/amara.json
 create mode 100644 datasets/an-nahar_newspaper_text_corpus.json
 create mode 100644 datasets/anad__arabic_natural_audio_dataset.json
 create mode 100644 datasets/anercorp.json
 create mode 100644 datasets/anetac.json
 create mode 100644 datasets/annotated_shami_corpus.json
 create mode 100644 datasets/annotated_tweet_corpus_in_arabizi,_french_and_english.json
 create mode 100644 datasets/ans_corpus___claim_verification.json
 create mode 100644 datasets/antcorpus.json
 create mode 100644 datasets/anti-social_behaviour_in_online_communication.json
 create mode 100644 datasets/aoc-aldi.json
 create mode 100644 datasets/aoc.json
 create mode 100644 datasets/apcd.json
 create mode 100644 datasets/apcd2.json
 create mode 100644 datasets/apgc_v1_0__arabic_parallel_gender_corpus_v1_0.json
 create mode 100644 datasets/apgc_v2_0__arabic_parallel_gender_corpus_v2_0.json
 create mode 100644 datasets/aqad__arabic_question-answer_dataset.json
 create mode 100644 datasets/aqmar.json
 create mode 100644 datasets/aqqac.json
 create mode 100644 datasets/ar-asag.json
 create mode 100644 datasets/ar-embiddings__arabic_word_embeddings_for_sentiment_analysis.json
 create mode 100644 datasets/ara-timebank.json
 create mode 100644 datasets/arab-acquis.json
 create mode 100644 datasets/arab-andalusian_music_corpus.json
 create mode 100644 datasets/arab-esl.json
 create mode 100644 datasets/arab_states_analogy_dataset_(asad).json
 create mode 100644 datasets/arabceleb.json
 create mode 100644 datasets/arabench.json
 create mode 100644 datasets/arabglossbert.json
 create mode 100644 datasets/arabic-dataset-for-capt.json
 create mode 100644 datasets/arabic-dialect_english_parallel_text.json
 create mode 100644 datasets/arabic-english_named_entities_dataset.json
 create mode 100644 datasets/arabic-hebrew_ted_talks_parallel_corpus.json
 create mode 100644 datasets/arabic-multi-classification-dataset-amcd.json
 create mode 100644 datasets/arabic-news.json
 create mode 100644 datasets/arabic-ocr.json
 create mode 100644 datasets/arabic-openhermes-2_5.json
 create mode 100644 datasets/arabic-poem-emotion.json
 create mode 100644 datasets/arabic-stories-corpus.json
 create mode 100644 datasets/arabic_-_egyptian_comparable_wikipedia_corpus.json
 create mode 100644 datasets/arabic_100k_reviews.json
 create mode 100644 datasets/arabic_ala_lc__romanization.json
 create mode 100644 datasets/arabic_analogy.json
 create mode 100644 datasets/arabic_billion_words.json
 create mode 100644 datasets/arabic_broadcast_news_speech.json
 create mode 100644 datasets/arabic_broadcast_news_transcripts.json
 create mode 100644 datasets/arabic_business_corpora.json
 create mode 100644 datasets/arabic_common_voice.json
 create mode 100644 datasets/arabic_cts_levantine_fisher_training_data_set_3,_speech.json
 create mode 100644 datasets/arabic_cts_levantine_fisher_training_data_set_3,_transcripts.json
 create mode 100644 datasets/arabic_dev2.json
 create mode 100644 datasets/arabic_dialects_dataset.json
 create mode 100644 datasets/arabic_dictionary_of_inflected_words.json
 create mode 100644 datasets/arabic_document_classification_dataset.json
 create mode 100644 datasets/arabic_empathetic_dialogues.json
 create mode 100644 datasets/arabic_english_parallel_news_part_1.json
 create mode 100644 datasets/arabic_flood_twitter_dataset.json
 create mode 100644 datasets/arabic_gigaword.json
 create mode 100644 datasets/arabic_gigaword_fifth_edition.json
 create mode 100644 datasets/arabic_gigaword_fourth_edition.json
 create mode 100644 datasets/arabic_gigaword_second_edition.json
 create mode 100644 datasets/arabic_gigaword_third_edition.json
 create mode 100644 datasets/arabic_hate_speech_2022_shared_task.json
 create mode 100644 datasets/arabic_infectious_disease_ontology.json
 create mode 100644 datasets/arabic_keyphrase_dataset.json
 create mode 100644 datasets/arabic_morphological_dictionary.json
 create mode 100644 datasets/arabic_named_entities.json
 create mode 100644 datasets/arabic_named_entity_gazetteer.json
 create mode 100644 datasets/arabic_natural_audio_dataset.json
 create mode 100644 datasets/arabic_news_articles.json
 create mode 100644 datasets/arabic_news_articles_from_aljazeera_net.json
 create mode 100644 datasets/arabic_news_dataset_about_hajj.json
 create mode 100644 datasets/arabic_news_translation_text_part_1.json
 create mode 100644 datasets/arabic_news_tweets.json
 create mode 100644 datasets/arabic_newswire_english_translation_collection.json
 create mode 100644 datasets/arabic_newswire_part_1.json
 create mode 100644 datasets/arabic_ontology.json
 create mode 100644 datasets/arabic_osact4___offensive_language_detection.json
 create mode 100644 datasets/arabic_osact5___arabic_hate_speech.json
 create mode 100644 datasets/arabic_oscar.json
 create mode 100644 datasets/arabic_pos_dialect.json
 create mode 100644 datasets/arabic_punctuation_dataset.json
 create mode 100644 datasets/arabic_rc_datasets.json
 create mode 100644 datasets/arabic_satire_dataset.json
 create mode 100644 datasets/arabic_satirical_fake_news_dataset.json
 create mode 100644 datasets/arabic_senti-lexicon.json
 create mode 100644 datasets/arabic_sentiment_lexicons.json
 create mode 100644 datasets/arabic_sentiment_twitter_corpus.json
 create mode 100644 datasets/arabic_sms_chat.json
 create mode 100644 datasets/arabic_spam_and_ham_tweets.json
 create mode 100644 datasets/arabic_speech_commands_dataset.json
 create mode 100644 datasets/arabic_speech_corpus.json
 create mode 100644 datasets/arabic_speech_recognition_pronunciation_dictionary.json
 create mode 100644 datasets/arabic_stop_words.json
 create mode 100644 datasets/arabic_text_diacritization.json
 create mode 100644 datasets/arabic_textual_entailment_dataset.json
 create mode 100644 datasets/arabic_treebank_-_broadcast_news_v1_0.json
 create mode 100644 datasets/arabic_treebank_-_weblog.json
 create mode 100644 datasets/arabic_treebank__part_1_-_10k-word_english_translation.json
 create mode 100644 datasets/arabic_treebank__part_1_v_2_0.json
 create mode 100644 datasets/arabic_treebank__part_1_v_3_0_(pos_with_full_vocalization_+_syntactic_analysis).json
 create mode 100644 datasets/arabic_treebank__part_1_v_4_1.json
 create mode 100644 datasets/arabic_treebank__part_2_v_2_0.json
 create mode 100644 datasets/arabic_treebank__part_2_v_3_1.json
 create mode 100644 datasets/arabic_treebank__part_3.json
 create mode 100644 datasets/arabic_treebank__part_3_v_1_0.json
 create mode 100644 datasets/arabic_treebank__part_3_v_3_2.json
 create mode 100644 datasets/arabic_treebank__part_4_v_1_0_(mpg_annotation).json
 create mode 100644 datasets/arabic_tweets_about_infectious_diseases.json
 create mode 100644 datasets/arabic_wiki_data_dump_2018.json
 create mode 100644 datasets/arabic_wikipedia_20230101_bots.json
 create mode 100644 datasets/arabic_wikipedia_20230101_nobots.json
 create mode 100644 datasets/arabic_wikipedia_talk_pages.json
 create mode 100644 datasets/arabic_wikireading_and_kaiflematha.json
 create mode 100644 datasets/arabicaqa.json
 create mode 100644 datasets/arabichatespeechdataset.json
 create mode 100644 datasets/arabicmmlu.json
 create mode 100644 datasets/arabicqa_2_1m.json
 create mode 100644 datasets/arabicsa.json
 create mode 100644 datasets/arabicsenamticsimilaritydataset.json
 create mode 100644 datasets/arabicweb16.json
 create mode 100644 datasets/arabicweb24.json
 create mode 100644 datasets/arablex__database_of_arabic_general_vocabulary_(dag).json
 create mode 100644 datasets/arabscribe.json
 create mode 100644 datasets/aracovid19-mfh.json
 create mode 100644 datasets/aracovid19-ssd__arabic_covid-19_sentiment_and_sarcasm_detection_dataset.json
 create mode 100644 datasets/aracust.json
 create mode 100644 datasets/aradata.json
 create mode 100644 datasets/arafacts.json
 create mode 100644 datasets/aranews.json
 create mode 100644 datasets/aranpcc.json
 create mode 100644 datasets/arap-tweet_corpus.json
 create mode 100644 datasets/arasencorpus.json
 create mode 100644 datasets/arasenti.json
 create mode 100644 datasets/araspider.json
 create mode 100644 datasets/arastance.json
 create mode 100644 datasets/arasum_corpus.json
 create mode 100644 datasets/arbanking77.json
 create mode 100644 datasets/arc-wmi.json
 create mode 100644 datasets/arcd.json
 create mode 100644 datasets/arcorona__analyzing_arabic_tweets_in_the_early_days_of_coronavirus_(covid-19)_pandemic.json
 create mode 100644 datasets/arcov-19.json
 create mode 100644 datasets/arcov19-rumors.json
 create mode 100644 datasets/arcovidvac.json
 create mode 100644 datasets/areej.json
 create mode 100644 datasets/arentail.json
 create mode 100644 datasets/arl_arabic_dependency_treebank.json
 create mode 100644 datasets/armath.json
 create mode 100644 datasets/armi__arabic_misogynistic_dataset.json
 create mode 100644 datasets/arparallel.json
 create mode 100644 datasets/arpc__a_corpus_for_paraphrase_identification_in_arabic_text.json
 create mode 100644 datasets/arpod.json
 create mode 100644 datasets/arqat-aqi__answerable_question_identification_in_arabic_tweets.json
 create mode 100644 datasets/arquad.json
 create mode 100644 datasets/arsarcasm-v2.json
 create mode 100644 datasets/arsarcasm.json
 create mode 100644 datasets/arsarcasmoji.json
 create mode 100644 datasets/arsas.json
 create mode 100644 datasets/arsen-20.json
 create mode 100644 datasets/arsenl.json
 create mode 100644 datasets/arsentd-lev.json
 create mode 100644 datasets/arsentiment.json
 create mode 100644 datasets/arsl21l.json
 create mode 100644 datasets/artest.json
 create mode 100644 datasets/artrivia.json
 create mode 100644 datasets/arvox.json
 create mode 100644 datasets/arwiki.json
 create mode 100644 datasets/arzen-multigenre.json
 create mode 100644 datasets/asad.json
 create mode 100644 datasets/asayar.json
 create mode 100644 datasets/ashaar.json
 create mode 100644 datasets/askfm.json
 create mode 100644 datasets/astad.json
 create mode 100644 datasets/astd.json
 create mode 100644 datasets/at-odtsa.json
 create mode 100644 datasets/atar.json
 create mode 100644 datasets/athar.json
 create mode 100644 datasets/attimam.json
 create mode 100644 datasets/author_attribution_tweets.json
 create mode 100644 datasets/autotweet.json
 create mode 100644 datasets/aya_dataset.json
 create mode 100644 datasets/ayatec.json
 create mode 100644 datasets/baec.json
 create mode 100644 datasets/baladi_lebanese_dialect_corpora.json
 create mode 100644 datasets/baved.json
 create mode 100644 datasets/bbn_aub_darpa_babylon_levantine_arabic_speech_and_transcripts.json
 create mode 100644 datasets/bbn_blog_posts_sentiment_corpus.json
 create mode 100644 datasets/belebele.json
 create mode 100644 datasets/bible_para.json
 create mode 100644 datasets/bnl_historical_newspapers.json
 create mode 100644 datasets/bolt_arabic_discussion_forum_parallel_training_data.json
 create mode 100644 datasets/bolt_arabic_discussion_forums.json
 create mode 100644 datasets/bolt_egyptian-english_word_alignment_--_discussion_forum_training.json
 create mode 100644 datasets/bolt_egyptian_arabic-english_word_alignment_--_conversational_telephone_speech_training.json
 create mode 100644 datasets/bolt_egyptian_arabic-english_word_alignment_--_sms_chat_training.json
 create mode 100644 datasets/bolt_egyptian_arabic_co-reference_--_discussion_forum,_sms_chat,_and_conversational_telephone_speech.json
 create mode 100644 datasets/bolt_egyptian_arabic_propbank_and_sense_--_discussion_forum,_sms_chat,_and_conversational_telephone_speech.json
 create mode 100644 datasets/bolt_egyptian_arabic_sms_chat_parallel_training_data.json
 create mode 100644 datasets/bolt_egyptian_arabic_treebank_-_conversational_telephone_speech.json
 create mode 100644 datasets/bolt_egyptian_arabic_treebank_-_discussion_forum.json
 create mode 100644 datasets/bolt_egyptian_arabic_treebank_-_sms_chat.json
 create mode 100644 datasets/bolt_information_retrieval_comprehensive_training_and_evaluation.json
 create mode 100644 datasets/botta.json
 create mode 100644 datasets/brad_1_0.json
 create mode 100644 datasets/brad_2_0.json
 create mode 100644 datasets/buckwalter_arabic_morphological_analyzer_version_1_0.json
 create mode 100644 datasets/buckwalter_arabic_morphological_analyzer_version_2_0.json
 create mode 100644 datasets/calima-glf.json
 create mode 100644 datasets/callfriend_egyptian_arabic.json
 create mode 100644 datasets/callfriend_egyptian_arabic_second_edition.json
 create mode 100644 datasets/callhome__egyptian_arabic_speech_translation_corpus.json
 create mode 100644 datasets/callhome_egyptian_arabic_speech.json
 create mode 100644 datasets/callhome_egyptian_arabic_speech_supplement.json
 create mode 100644 datasets/callhome_egyptian_arabic_transcripts.json
 create mode 100644 datasets/callhome_egyptian_arabic_transcripts_supplement.json
 create mode 100644 datasets/calliar.json
 create mode 100644 datasets/calyou.json
 create mode 100644 datasets/cameltb__camel_treebank_1_0.json
 create mode 100644 datasets/canercorpus.json
 create mode 100644 datasets/cc-100.json
 create mode 100644 datasets/ccaligned.json
 create mode 100644 datasets/ccmatrix.json
 create mode 100644 datasets/ceap.json
 create mode 100644 datasets/checkthat-ar.json
 create mode 100644 datasets/childes_egyptian_arabic_salama_corpus.json
 create mode 100644 datasets/ciad__corpus_of_iraqi_arabic_dialect.json
 create mode 100644 datasets/cidar.json
 create mode 100644 datasets/clartts__an_open-source_classical_arabic_text-to-speech_corpus.json
 create mode 100644 datasets/classical_arabic_dictionary.json
 create mode 100644 datasets/cleananercorp.json
 create mode 100644 datasets/coda__(conventional_orthography_for_dialectal_arabic).json
 create mode 100644 datasets/commonlanguage.json
 create mode 100644 datasets/commonsense_validation.json
 create mode 100644 datasets/comparable_wikipedia_coprus.json
 create mode 100644 datasets/comprehensive_word_lists_for_chinese,_japanese,_korean_and_arabic.json
 create mode 100644 datasets/cormap__corpus_for_moroccan_arabic_processing.json
 create mode 100644 datasets/coronavirus.json
 create mode 100644 datasets/corpora_for_egyptian_arabic_and_gulf_arabic_from_twitter.json
 create mode 100644 datasets/corpus_of_offensive_language_in_arabic.json
 create mode 100644 datasets/covid-19-arabic-tweets-dataset.json
 create mode 100644 datasets/covid-19_disinfo__covid-19_disinformation_twitter_dataset.json
 create mode 100644 datasets/covid-fakes.json
 create mode 100644 datasets/covost_2.json
 create mode 100644 datasets/cqa-md__semeval-2016_task_3.json
 create mode 100644 datasets/cross-lingual_ner.json
 create mode 100644 datasets/crosssum.json
 create mode 100644 datasets/cslu__22_languages_corpus.json
 create mode 100644 datasets/ctab__corpus_of_tunisian_arabizi.json
 create mode 100644 datasets/culturax.json
 create mode 100644 datasets/curras.json
 create mode 100644 datasets/daict.json
 create mode 100644 datasets/dares.json
 create mode 100644 datasets/dart.json
 create mode 100644 datasets/database_of_arab_names.json
 create mode 100644 datasets/database_of_arabic_plurals.json
 create mode 100644 datasets/database_of_foreign_names_in_arabic.json
 create mode 100644 datasets/dataset_for_arabic_classification.json
 create mode 100644 datasets/dataset_for_evaluating_root_extraction.json
 create mode 100644 datasets/dawqas__a_dataset_for_arabic_why_question_answering_system.json
 create mode 100644 datasets/defarabicqa.json
 create mode 100644 datasets/detect_egyptian_wikipedia_template-translated_articles.json
 create mode 100644 datasets/dialectal_arabic_code-switching_dataset.json
 create mode 100644 datasets/dialex.json
 create mode 100644 datasets/disease_ner.json
 create mode 100644 datasets/dixaf__bilingual_dictionary_french_arabic,_arabic_french.json
 create mode 100644 datasets/doda__darija_open_dataset.json
 create mode 100644 datasets/dzdc12.json
 create mode 100644 datasets/dziribert.json
 create mode 100644 datasets/easc.json
 create mode 100644 datasets/edgad.json
 create mode 100644 datasets/egyptian_arabic_wikipedia_20230101.json
 create mode 100644 datasets/egyptian_colloquial_arabic_lexicon.json
 create mode 100644 datasets/elecmorocco2016.json
 create mode 100644 datasets/emoji-sentiment-dataset.json
 create mode 100644 datasets/emotional-tone.json
 create mode 100644 datasets/english-arabic_treebank_v_1_0.json
 create mode 100644 datasets/everyayah.json
 create mode 100644 datasets/evetar.json
 create mode 100644 datasets/exams.json
 create mode 100644 datasets/fisher_levantine_arabic_conversational_telephone_speech,_transcripts.json
 create mode 100644 datasets/fisher_levantine_arabic_conversational_telephone_speech.json
 create mode 100644 datasets/flodusta.json
 create mode 100644 datasets/flores-101.json
 create mode 100644 datasets/gale_arabic-english_parallel_aligned_treebank_--_broadcast_news_part_1.json
 create mode 100644 datasets/gale_arabic-english_parallel_aligned_treebank_--_broadcast_news_part_2.json
 create mode 100644 datasets/gale_arabic-english_parallel_aligned_treebank_--_newswire.json
 create mode 100644 datasets/gale_arabic-english_parallel_aligned_treebank_--_web_training.json
 create mode 100644 datasets/gale_arabic-english_word_alignment_--_broadcast_training_part_1.json
 create mode 100644 datasets/gale_arabic-english_word_alignment_--_broadcast_training_part_2.json
 create mode 100644 datasets/gale_arabic-english_word_alignment_training_part_1_--_newswire_and_web.json
 create mode 100644 datasets/gale_arabic-english_word_alignment_training_part_2_--_newswire.json
 create mode 100644 datasets/gale_arabic-english_word_alignment_training_part_3_--_web.json
 create mode 100644 datasets/gale_phase_1_arabic_blog_parallel_text.json
 create mode 100644 datasets/gale_phase_1_arabic_broadcast_news_parallel_text_-_part_1.json
 create mode 100644 datasets/gale_phase_1_arabic_broadcast_news_parallel_text_-_part_2.json
 create mode 100644 datasets/gale_phase_1_arabic_newsgroup_parallel_text_-_part_1.json
 create mode 100644 datasets/gale_phase_1_arabic_newsgroup_parallel_text_-_part_2.json
 create mode 100644 datasets/gale_phase_1_distillation_training.json
 create mode 100644 datasets/gale_phase_2_arabic_broadcast_conversation_parallel_text_part_1.json
 create mode 100644 datasets/gale_phase_2_arabic_broadcast_conversation_parallel_text_part_2.json
 create mode 100644 datasets/gale_phase_2_arabic_broadcast_conversation_speech_part_1.json
 create mode 100644 datasets/gale_phase_2_arabic_broadcast_conversation_speech_part_2.json
 create mode 100644 datasets/gale_phase_2_arabic_broadcast_conversation_transcripts_part_1.json
 create mode 100644 datasets/gale_phase_2_arabic_broadcast_conversation_transcripts_part_2.json
 create mode 100644 datasets/gale_phase_2_arabic_broadcast_news_parallel_text.json
 create mode 100644 datasets/gale_phase_2_arabic_broadcast_news_speech_part_1.json
 create mode 100644 datasets/gale_phase_2_arabic_broadcast_news_speech_part_2.json
 create mode 100644 datasets/gale_phase_2_arabic_broadcast_news_transcripts_part_1.json
 create mode 100644 datasets/gale_phase_2_arabic_broadcast_news_transcripts_part_2.json
 create mode 100644 datasets/gale_phase_2_arabic_newswire_parallel_text.json
 create mode 100644 datasets/gale_phase_2_arabic_web_parallel_text.json
 create mode 100644 datasets/gale_phase_3_and_4_arabic_broadcast_conversation_parallel_text.json
 create mode 100644 datasets/gale_phase_3_and_4_arabic_broadcast_news_parallel_text.json
 create mode 100644 datasets/gale_phase_3_and_4_arabic_newswire_parallel_text.json
 create mode 100644 datasets/gale_phase_3_and_4_arabic_web_parallel_text.json
 create mode 100644 datasets/gale_phase_3_arabic_broadcast_conversation_speech_part_1.json
 create mode 100644 datasets/gale_phase_3_arabic_broadcast_conversation_speech_part_2.json
 create mode 100644 datasets/gale_phase_3_arabic_broadcast_conversation_transcripts_part_1.json
 create mode 100644 datasets/gale_phase_3_arabic_broadcast_conversation_transcripts_part_2.json
 create mode 100644 datasets/gale_phase_3_arabic_broadcast_news_speech_part_1.json
 create mode 100644 datasets/gale_phase_3_arabic_broadcast_news_speech_part_2.json
 create mode 100644 datasets/gale_phase_3_arabic_broadcast_news_transcripts_part_1.json
 create mode 100644 datasets/gale_phase_3_arabic_broadcast_news_transcripts_part_2.json
 create mode 100644 datasets/gale_phase_4_arabic_broadcast_conversation_parallel_sentences.json
 create mode 100644 datasets/gale_phase_4_arabic_broadcast_conversation_speech.json
 create mode 100644 datasets/gale_phase_4_arabic_broadcast_conversation_transcripts.json
 create mode 100644 datasets/gale_phase_4_arabic_broadcast_news_parallel_sentences.json
 create mode 100644 datasets/gale_phase_4_arabic_broadcast_news_speech.json
 create mode 100644 datasets/gale_phase_4_arabic_broadcast_news_transcripts.json
 create mode 100644 datasets/gale_phase_4_arabic_newswire_parallel_sentences.json
 create mode 100644 datasets/gale_phase_4_arabic_weblog_parallel_sentences.json
 create mode 100644 datasets/gem.json
 create mode 100644 datasets/gem_-_wikilingua.json
 create mode 100644 datasets/gem_-_xlsum.json
 create mode 100644 datasets/geowac.json
 create mode 100644 datasets/glare.json
 create mode 100644 datasets/gnome.json
 create mode 100644 datasets/goud-sum.json
 create mode 100644 datasets/gulf_arabic_conversational_telephone_speech,_transcripts.json
 create mode 100644 datasets/gulf_arabic_conversational_telephone_speech.json
 create mode 100644 datasets/gumar.json
 create mode 100644 datasets/haad.json
 create mode 100644 datasets/habibi.json
 create mode 100644 datasets/hard.json
 create mode 100644 datasets/hc_corpora.json
 create mode 100644 datasets/hijja.json
 create mode 100644 datasets/hyter_networks_of_selected_openmt08_09_sentences.json
 create mode 100644 datasets/idat.json
 create mode 100644 datasets/idrisi-r.json
 create mode 100644 datasets/inaracorpus.json
 create mode 100644 datasets/infopankki_v1.json
 create mode 100644 datasets/international_corpus_of_arabic.json
 create mode 100644 datasets/iraqi_arabic_conversational_telephone_speech,_transcripts.json
 create mode 100644 datasets/iraqi_arabic_conversational_telephone_speech.json
 create mode 100644 datasets/isarcasmeval__semeval-2022_task_6.json
 create mode 100644 datasets/isi_arabic-english_automatically_extracted_parallel_text.json
 create mode 100644 datasets/jana__a_human-human_dialogues_corpus_for_egyptian_dialect.json
 create mode 100644 datasets/journalists_questions.json
 create mode 100644 datasets/kacst.json
 create mode 100644 datasets/kalamdz.json
 create mode 100644 datasets/kalimat.json
 create mode 100644 datasets/kawarith.json
 create mode 100644 datasets/kde4.json
 create mode 100644 datasets/khaleej-2004.json
 create mode 100644 datasets/khalidalt_tydiqa-goldp.json
 create mode 100644 datasets/khawas.json
 create mode 100644 datasets/kind.json
 create mode 100644 datasets/ksaa-rd_dataset.json
 create mode 100644 datasets/ksu_rich_arabic_speech_database.json
 create mode 100644 datasets/ksucca_corpus.json
 create mode 100644 datasets/ksuemotions.json
 create mode 100644 datasets/kunuz.json
 create mode 100644 datasets/l-hsab.json
 create mode 100644 datasets/labr.json
 create mode 100644 datasets/lama.json
 create mode 100644 datasets/language_identification.json
 create mode 100644 datasets/language_understanding_annotation_corpus.json
 create mode 100644 datasets/large_multi-domain_resources_for_arabic_sentiment_analysis.json
 create mode 100644 datasets/laser.json
 create mode 100644 datasets/lc-star__standard_arabic_phonetic_lexicon.json
 create mode 100644 datasets/ldc_standard_arabic_morphological_analyzer_(sama)_version_3_1.json
 create mode 100644 datasets/le_monde_diplomatique__arabic_tagged_corpus.json
 create mode 100644 datasets/lebanon_uprising_arabic_tweets.json
 create mode 100644 datasets/let-mi.json
 create mode 100644 datasets/levantine_arabic_conversational_telephone_speech,_transcripts.json
 create mode 100644 datasets/levantine_arabic_conversational_telephone_speech.json
 create mode 100644 datasets/levantine_arabic_qt_training_data_set_4_(speech_+_transcripts).json
 create mode 100644 datasets/levantine_arabic_qt_training_data_set_5,_speech.json
 create mode 100644 datasets/levantine_arabic_qt_training_data_set_5,_transcripts.json
 create mode 100644 datasets/lince_-_msa-da__(lid_-_code_switching_).json
 create mode 100644 datasets/lince_-_msa-egy_(ner_-_code_switching).json
 create mode 100644 datasets/lisan.json
 create mode 100644 datasets/lk-hadith-corpus.json
 create mode 100644 datasets/madar.json
 create mode 100644 datasets/madar_lexicon.json
 create mode 100644 datasets/madar_twitter_corpus.json
 create mode 100644 datasets/madcat_phase_1_training_set.json
 create mode 100644 datasets/madcat_phase_2_training_set.json
 create mode 100644 datasets/madcat_phase_3_training_set.json
 create mode 100644 datasets/maknuune.json
 create mode 100644 datasets/marsa__multi-domain_arabic_resources_for_sentiment_analysis.json
 create mode 100644 datasets/marsum__moroccan_articles_summarisation.json
 create mode 100644 datasets/masc.json
 create mode 100644 datasets/masc__massive_arabic_speech_corpus.json
 create mode 100644 datasets/masked_arab_states_dataset_(masd).json
 create mode 100644 datasets/mawqif.json
 create mode 100644 datasets/mc4.json
 create mode 100644 datasets/mcwc.json
 create mode 100644 datasets/mediaspeech.json
 create mode 100644 datasets/medical_corpus.json
 create mode 100644 datasets/mega-cov.json
 create mode 100644 datasets/merged_arabic_corpus_of_isolated_words.json
 create mode 100644 datasets/metrec.json
 create mode 100644 datasets/mfqa.json
 create mode 100644 datasets/mgb-2.json
 create mode 100644 datasets/mgb-3.json
 create mode 100644 datasets/mgb-5.json
 create mode 100644 datasets/microsoft_terminology_collection.json
 create mode 100644 datasets/mimic-it.json
 create mode 100644 datasets/miracl.json
 create mode 100644 datasets/mkqa.json
 create mode 100644 datasets/ml_spoken_words.json
 create mode 100644 datasets/mldr.json
 create mode 100644 datasets/mlma_hate_speech.json
 create mode 100644 datasets/mlqa.json
 create mode 100644 datasets/mmac.json
 create mode 100644 datasets/mmedc.json
 create mode 100644 datasets/moarlex.json
 create mode 100644 datasets/moroccan_arabic_wikipedia_20230101_bots.json
 create mode 100644 datasets/moroccan_arabic_wikipedia_20230101_nobots.json
 create mode 100644 datasets/mozilla_foundation_common_voice_dataset.json
 create mode 100644 datasets/mpold__multi_platforms_offensive_language_dataset.json
 create mode 100644 datasets/mr__tydi.json
 create mode 100644 datasets/msac.json
 create mode 100644 datasets/msda.json
 create mode 100644 datasets/mtvqa.json
 create mode 100644 datasets/multi-language_conversational_telephone_speech_2011_--_arabic_group.json
 create mode 100644 "datasets/multilingual_dictionary_of_sports__\342\200\223_english-french-arabic_trilingual_database.json"
 create mode 100644 "datasets/multilingual_hate\r\nspeech_detection_dataset.json"
 create mode 100644 datasets/multilingual_lama.json
 create mode 100644 datasets/multilingual_reward_bench.json
 create mode 100644 datasets/multilingual_tts.json
 create mode 100644 datasets/multiple-translation_arabic_(mta)_part_1.json
 create mode 100644 datasets/multiple-translation_arabic_(mta)_part_2.json
 create mode 100644 datasets/multitacred.json
 create mode 100644 datasets/multiun_v2.json
 create mode 100644 datasets/munazarat_1_0.json
 create mode 100644 datasets/nabra.json
 create mode 100644 datasets/nada.json
 create mode 100644 datasets/nadi-2020.json
 create mode 100644 datasets/nadi-2021.json
 create mode 100644 datasets/nadia.json
 create mode 100644 datasets/nafis__normalized_arabic_fragments_for_inestimable_stemming.json
 create mode 100644 datasets/naim_mhedhbi_tunisian_dialect_corpus_v0.json
 create mode 100644 datasets/named_entities_lexicon.json
 create mode 100644 datasets/names_transliteration.json
 create mode 100644 datasets/narabizi_corpus.json
 create mode 100644 datasets/narabizi_treebank.json
 create mode 100644 datasets/ne3l__named_entities_arabic_corpus.json
 create mode 100644 datasets/negation_and_speculation_in_arabic_review_(nsar).json
 create mode 100644 datasets/nemlar__broadcast_news_speech_corpus.json
 create mode 100644 datasets/nemlar__speech_synthesis_corpus.json
 create mode 100644 datasets/nemlar__written_corpus.json
 create mode 100644 datasets/nemlar_written_corpus.json
 create mode 100644 datasets/netdc_arabic_bnsc__broadcast_news_speech_corpus.json
 create mode 100644 datasets/netransliteration.json
 create mode
100644 datasets/news_commentary.json create mode 100644 datasets/newstent.json create mode 100644 datasets/nileulex.json create mode 100644 datasets/nist_2002_open_machine_translation_(openmt)_evaluation.json create mode 100644 datasets/nist_2003_open_machine_translation_(openmt)_evaluation.json create mode 100644 datasets/nist_2004_open_machine_translation_(openmt)_evaluation.json create mode 100644 datasets/nist_2005_open_machine_translation_(openmt)_evaluation.json create mode 100644 datasets/nist_2006_open_machine_translation_(openmt)_evaluation.json create mode 100644 datasets/nist_2008-2012_open_machine_translation_(openmt)_progress_test_sets.json create mode 100644 datasets/nist_2008_open_machine_translation_(openmt)_evaluation.json create mode 100644 datasets/nist_2009_open_machine_translation_(openmt)_evaluation.json create mode 100644 datasets/nist_2012_open_machine_translation_(openmt)_progress_test_five_language_source.json create mode 100644 datasets/nist_open_mt_2008_evaluation_(mt08)_selected_references_and_system_translations.json create mode 100644 datasets/nlp_dataset_for_arabic_dialects.json create mode 100644 datasets/nsurl-2019_shared_task_8.json create mode 100644 datasets/oasst2.json create mode 100644 datasets/oca__opinion_corpus_for_arabic.json create mode 100644 datasets/oclar.json create mode 100644 datasets/offenseval_2020.json create mode 100644 datasets/omcca.json create mode 100644 datasets/ontonotes_5_0.json create mode 100644 datasets/ontonotes_release_3_0.json create mode 100644 datasets/ontonotes_release_4_0.json create mode 100644 datasets/ontonotes_release_5.json create mode 100644 datasets/open-domain_response_generation_in_arabic_dialects.json create mode 100644 datasets/openiti-proc.json create mode 100644 datasets/opensubtitles.json create mode 100644 datasets/opus100.json create mode 100644 datasets/opus_ubuntu.json create mode 100644 datasets/opus_wikipedia.json create mode 100644 datasets/orientel_egypt_mca__modern_colloquial_arabic_database.json create mode 100644 datasets/orientel_egypt_msa_(modern_standard_arabic)_database.json create mode 100644 datasets/orientel_jordan_msa_(modern_standard_arabic)_database.json create mode 100644 datasets/orientel_tunisia_msa_(modern_standard_arabic)_database.json create mode 100644 datasets/osac.json create mode 100644 datasets/oscar-2201.json create mode 100644 datasets/oscar_small.json create mode 100644 datasets/osian.json create mode 100644 datasets/osman.json create mode 100644 datasets/osman_un_corpus.json create mode 100644 datasets/paad.json create mode 100644 datasets/padic.json create mode 100644 datasets/padic__parallel_arabic_dialect_corpus.json create mode 100644 datasets/pan17_author_profiling.json create mode 100644 datasets/pan18_author_profiling.json create mode 100644 datasets/pan_arabic_intrinsic_plagiarism_detection_shared_task_corpus.json create mode 100644 datasets/papluca_language-identification.json create mode 100644 datasets/phonbank_arabic_kuwaiti_corpus.json create mode 100644 datasets/phonemes_of_arabic.json create mode 100644 datasets/phrasebank__collins_multilingual_database_(mld).json create mode 100644 datasets/polyglot-ner.json create mode 100644 datasets/prague_arabic_dependency_treebank_1_0.json create mode 100644 datasets/ptcc.json create mode 100644 datasets/pulpo.json create mode 100644 datasets/qa4mre.json create mode 100644 datasets/qabas.json create mode 100644 datasets/qac__qatari_arabic_corpus.json create mode 100644 datasets/qadi_arabic.json create mode 100644 
datasets/qasr.json create mode 100644 datasets/qatari_heritage_corpus.json create mode 100644 datasets/qcri_parallel_tweets.json create mode 100644 datasets/quran_hadith_datasets.json create mode 100644 datasets/quran_speech__imam_+_users.json create mode 100644 datasets/quranic_arabic_corpus.json create mode 100644 datasets/rats_keyword_spotting.json create mode 100644 datasets/rats_language_identification.json create mode 100644 datasets/rats_speaker_identification.json create mode 100644 datasets/rats_speech_activity_detection.json create mode 100644 datasets/reflex_entity_translation_training_devtest.json create mode 100644 datasets/religious_hate_speech.json create mode 100644 datasets/rewayatech.json create mode 100644 datasets/rsac.json create mode 100644 datasets/sa`7r.json create mode 100644 datasets/sad.json create mode 100644 datasets/salma.json create mode 100644 datasets/samer_readability_lexicon.json create mode 100644 datasets/sanad.json create mode 100644 datasets/sanadset_650k__data_on_hadith_narrators.json create mode 100644 datasets/saudinewsnet.json create mode 100644 datasets/semeval-2017_task_4.json create mode 100644 datasets/semeval-2018_task_1.json create mode 100644 datasets/semeval-2021_task_2.json create mode 100644 datasets/senti_lex.json create mode 100644 datasets/sentiment_lexicons_for_81_languages.json create mode 100644 datasets/senwave.json create mode 100644 datasets/senzi.json create mode 100644 datasets/serag__semantic_entity_retrieval_from_arabic_knowledge_graphs.json create mode 100644 datasets/shakkelha.json create mode 100644 datasets/shamela.json create mode 100644 datasets/shamela_diacritics_corpus.json create mode 100644 datasets/shamela_et_al_arabic_corpus.json create mode 100644 datasets/shami.json create mode 100644 datasets/snad.json create mode 100644 datasets/sohateful.json create mode 100644 datasets/speech-massive.json create mode 100644 datasets/spiral.json create mode 100644 datasets/stopword_lists_for_19_languages.json create mode 100644 datasets/student_university_corpus.json create mode 100644 datasets/sudanese_dialect_tweets_about_ridesharing_companies.json create mode 100644 datasets/sudanese_dialect_tweets_about_telecommunication_companies.json create mode 100644 datasets/sudannese_arabic_telcom_sentiment_classification_pre_processed.json create mode 100644 datasets/synonyms.json create mode 100644 datasets/syria_tweets_sentiment_corpus.json create mode 100644 datasets/talaa.json create mode 100644 datasets/tanzil.json create mode 100644 datasets/tapaco.json create mode 100644 datasets/tarc.json create mode 100644 datasets/tashkeela.json create mode 100644 datasets/tatoeba.json create mode 100644 datasets/tatoeba_translation_challenge.json create mode 100644 datasets/tdt4_multilingual_broadcast_news_speech_corpus.json create mode 100644 datasets/tdt4_multilingual_text_and_annotations.json create mode 100644 datasets/tdt5_multilingual_text.json create mode 100644 datasets/tdt5_topics_and_annotations.json create mode 100644 datasets/tead.json create mode 100644 datasets/ted_talks_corpus_(wit3).json create mode 100644 datasets/ted_talks_iwslt.json create mode 100644 datasets/the_arabic_e-book_corpus.json create mode 100644 datasets/the_arabic_speech_corpus_for_isolated_words.json create mode 100644 datasets/the_nine_books_of_arabic_hadith.json create mode 100644 datasets/the_sadid_evaluation_datasets.json create mode 100644 datasets/tides_extraction_(ace)_2003_multilingual_training_data.json create mode 100644 
datasets/toxi-text-3m.json create mode 100644 datasets/trad_arabic-english_mailing_lists_parallel_corpus_-_development_set.json create mode 100644 datasets/trad_arabic-english_mailing_lists_parallel_corpus_-_test_set.json create mode 100644 datasets/trad_arabic-english_newspaper_parallel_corpus_-_test_set_1.json create mode 100644 datasets/trad_arabic-english_parallel_corpus_of_transcribed_broadcast_news_speech.json create mode 100644 datasets/trad_arabic-english_web_domain_(blogs)_parallel_corpus.json create mode 100644 datasets/trad_arabic-french_mailing_lists_parallel_corpus_-_development_set.json create mode 100644 datasets/trad_arabic-french_mailing_lists_parallel_corpus_-_test_set.json create mode 100644 datasets/trad_arabic-french_parallel_text_--_newsgroup.json create mode 100644 datasets/trad_arabic-french_parallel_text_--_newswire.json create mode 100644 datasets/transliteration.json create mode 100644 datasets/trecvid_2005_keyframes_&_transcripts.json create mode 100644 datasets/trecvid_2006_keyframes.json create mode 100644 datasets/troll_detection.json create mode 100644 datasets/tsac.json create mode 100644 datasets/tudicoi.json create mode 100644 datasets/tufs_media.json create mode 100644 datasets/tunisian_arabic_corpus.json create mode 100644 datasets/tunizi.json create mode 100644 datasets/twifil.json create mode 100644 datasets/twt15da_lists.json create mode 100644 datasets/tydiqa.json create mode 100644 datasets/udp_(udp-nyuad).json create mode 100644 datasets/ufal_parallel_corpus_of_north_levantine_1_0.json create mode 100644 datasets/ultimate_arabic_news_dataset.json create mode 100644 datasets/un_multi.json create mode 100644 datasets/understanding_and_detecting_dangerous_speech_in_social_media.json create mode 100644 datasets/unified_linguistic_annotation_text_collection.json create mode 100644 datasets/unimorph.json create mode 100644 datasets/united_nations_general_assembly_resolutions.json create mode 100644 datasets/united_nations_parallel_corpus.json create mode 100644 datasets/united_nations_proceedings_speech.json create mode 100644 datasets/universal_dependencies.json create mode 100644 datasets/watan-2004.json create mode 100644 datasets/waw.json create mode 100644 datasets/wdc.json create mode 100644 datasets/west_point_arabic_speech.json create mode 100644 datasets/wikiann.json create mode 100644 datasets/wikiann_ner(mmner).json create mode 100644 datasets/wikidocsaligner_dataset.json create mode 100644 datasets/wikimatrix.json create mode 100644 datasets/wikimedia_wit_base.json create mode 100644 datasets/wikipedia.json create mode 100644 datasets/wikiqaar.json create mode 100644 datasets/wili-2018.json create mode 100644 datasets/winomt_(mt_gender).json create mode 100644 datasets/wojood.json create mode 100644 datasets/wojoodfine.json create mode 100644 datasets/wojoodgaza.json create mode 100644 datasets/wojoodhadath.json create mode 100644 datasets/wsd.json create mode 100644 datasets/x-csr.json create mode 100644 datasets/xcsr.json create mode 100644 datasets/xglue.json create mode 100644 datasets/xl-headtags.json create mode 100644 datasets/xlel_wd.json create mode 100644 datasets/xlel_wd_dictionary.json create mode 100644 datasets/xnli.json create mode 100644 datasets/xor-tydi_qa.json create mode 100644 datasets/xp3all.json create mode 100644 datasets/xquad.json create mode 100644 datasets/xquad_r.json create mode 100644 datasets/xsid_-_(x)_slot_and_intent_detection.json create mode 100644 datasets/xtreme.json create mode 100644 
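Since this patch splits the catalog into one JSON file per dataset under datasets/, a consumer now has to enumerate the directory instead of reading a single aggregated file. The following is a minimal sketch of such a loader; it is a hypothetical helper, not part of this patch, and assumes only the datasets/ layout created above.

import json
from pathlib import Path

# Hypothetical catalog loader: assumes it runs from the repository root,
# where this patch creates one JSON object per file under datasets/.
def load_catalog(root: str = "datasets") -> list[dict]:
    records = []
    for path in sorted(Path(root).glob("*.json")):
        with open(path, encoding="utf-8") as handle:
            records.append(json.load(handle))
    return records

if __name__ == "__main__":
    print(f"loaded {len(load_catalog())} dataset records")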
diff --git a/datasets/101_billion_arabic_words_dataset.json b/datasets/101_billion_arabic_words_dataset.json
new file mode 100644
index 0000000..7b17c4c
--- /dev/null
+++ b/datasets/101_billion_arabic_words_dataset.json
@@ -0,0 +1,36 @@
+{
+    "Name": "101 Billion Arabic Words Dataset",
+    "Subsets": [],
+    "HF Link": "https://hf.co/datasets/ClusterlabAi/101_billion_arabic_words_dataset",
+    "Link": "https://hf.co/datasets/ClusterlabAi/101_billion_arabic_words_dataset",
+    "License": "Apache-2.0",
+    "Year": 2024,
+    "Language": "ar",
+    "Dialect": "mixed",
+    "Domain": "web pages",
+    "Form": "text",
+    "Collection Style": "crawling",
+    "Description": "The 101 Billion Arabic Words Dataset is curated by the Clusterlab team and consists of 101 billion words extracted and cleaned from web content, specifically targeting Arabic text. This dataset is intended for use in natural language processing applications, particularly in training and fine-tuning Large Language Models (LLMs) capable of understanding and generating Arabic text.",
+    "Volume": "101,000,000,000",
+    "Unit": "tokens",
+    "Ethical Risks": "High",
+    "Provider": "Clusterlab",
+    "Derived From": "Common Crawl",
+    "Paper Title": "101 Billion Arabic Words Dataset",
+    "Paper Link": "https://arxiv.org/pdf/2405.01590v1",
+    "Script": "Arab",
+    "Tokenized": "No",
+    "Host": "HuggingFace",
+    "Access": "Free",
+    "Cost": "nan",
+    "Test Split": "No",
+    "Tasks": "text generation, language modeling",
+    "Venue Title": "arXiv",
+    "Citations": "nan",
+    "Venue Type": "preprint",
+    "Venue Name": "nan",
+    "Authors": "Manel Aloui, Hasna Chouikhi, Ghaith Chaabane, Haithem Kchaou, and Chehir Dhaouadi",
+    "Affiliations": "Clusterlab",
+    "Abstract": "In recent years, Large Language Models (LLMs) have revolutionized the field of natural language processing, showcasing an impressive rise predominantly in English-centric domains. These advancements have set a global benchmark, inspiring significant efforts toward developing Arabic LLMs capable of understanding and generating the Arabic language with remarkable accuracy. Despite these advancements, a critical challenge persists: the potential bias in Arabic LLMs, primarily attributed to their reliance on datasets comprising English data that has been translated into Arabic. This reliance not only compromises the authenticity of the generated content but also reflects a broader issue\u2014the scarcity of original quality Arabic linguistic data. This study aims to address the data scarcity in the Arab world and to encourage the development of Arabic Language Models that are true to both the linguistic and nuances of the region. We undertook a large-scale data mining project, extracting a substantial volume of text from the Common Crawl WET files, specifically targeting Arabic content. The extracted data underwent a rigorous cleaning and deduplication process, using innovative techniques to ensure the integrity and uniqueness of the dataset. The result is the 101 Billion Arabic Words Dataset, the largest Arabic dataset available to date, which can significantly contribute to the development of authentic Arabic LLMs. This study not only highlights the potential for creating linguistically and culturally accurate Arabic LLMs but also sets a precedent for future research in enhancing the authenticity of Arabic language models.",
+    "Added By": "Zaid Alyafeai"
+}
\ No newline at end of file
diff --git a/datasets/1993-2007_united_nations_parallel_text.json b/datasets/1993-2007_united_nations_parallel_text.json
new file mode 100644
index 0000000..6ec5b07
--- /dev/null
+++ b/datasets/1993-2007_united_nations_parallel_text.json
@@ -0,0 +1,36 @@
+{
+    "Name": "1993-2007 United Nations Parallel Text",
+    "Subsets": [],
+    "HF Link": "nan",
+    "Link": "https://catalog.ldc.upenn.edu/LDC2013T06",
+    "License": "LDC User Agreement for Non-Members",
+    "Year": 2013,
+    "Language": "multilingual",
+    "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))",
+    "Domain": "other",
+    "Form": "text",
+    "Collection Style": "other",
+    "Description": "The data is presented as raw text and word-aligned text. The raw text is very close to what was extracted from the original word processing documents in UN ODS (e.g., Word, WordPerfect, PDF), converted to UTF-8 encoding.",
+    "Volume": "520,283",
+    "Unit": "documents",
+    "Ethical Risks": "Low",
+    "Provider": "LDC",
+    "Derived From": "nan",
+    "Paper Title": "nan",
+    "Paper Link": "nan",
+    "Script": "Arab",
+    "Tokenized": "No",
+    "Host": "LDC",
+    "Access": "With-Fee",
+    "Cost": "175.00 $",
+    "Test Split": "No",
+    "Tasks": "machine translation",
+    "Venue Title": "nan",
+    "Citations": "nan",
+    "Venue Type": "nan",
+    "Venue Name": "nan",
+    "Authors": "nan",
+    "Affiliations": "nan",
+    "Abstract": "nan",
+    "Added By": "Zaid Alyafeai"
+}
\ No newline at end of file
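Each record in this patch follows the same flat schema, with the literal string "nan" standing in for missing values and numeric volumes stored as comma-formatted strings. Below is a hedged normalization sketch; the field conventions are inferred from the records visible in this patch, not from a documented schema.

import json

# Sketch of normalizing one record, assuming the conventions visible in
# this patch: the string "nan" marks missing values, and "Volume" is a
# comma-formatted number string (e.g. "101,000,000,000").
def normalize(record: dict) -> dict:
    cleaned = {key: (None if value == "nan" else value) for key, value in record.items()}
    volume = cleaned.get("Volume")
    if isinstance(volume, str):
        try:
            cleaned["Volume"] = int(volume.replace(",", ""))
        except ValueError:
            pass  # leave non-numeric volumes untouched
    return cleaned

with open("datasets/101_billion_arabic_words_dataset.json", encoding="utf-8") as handle:
    record = normalize(json.load(handle))
print(record["Volume"])  # 101000000000 under the assumptions above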
diff --git a/datasets/1997_hub5_arabic_evaluation.json b/datasets/1997_hub5_arabic_evaluation.json
new file mode 100644
index 0000000..8d003e0
--- /dev/null
+++ b/datasets/1997_hub5_arabic_evaluation.json
@@ -0,0 +1,36 @@
+{
+    "Name": "1997 HUB5 Arabic Evaluation",
+    "Subsets": [],
+    "HF Link": "nan",
+    "Link": "https://catalog.ldc.upenn.edu/LDC2002S22",
+    "License": "LDC User Agreement for Non-Members",
+    "Year": 2002,
+    "Language": "ar",
+    "Dialect": "ar-EG: (Arabic (Egypt))",
+    "Domain": "transcribed audio",
+    "Form": "spoken",
+    "Collection Style": "other",
+    "Description": "This publication contains 20 sphere files encoded in two channel interleaved mulaw with a sampling rate of 8 KHz, for a total of 424,160,000 bytes (405 Mbytes) of sphere data. The sphere headers have been modified from the original Evaluation data by the addition of sample checksums to the CALLHOME data files.",
+    "Volume": "20",
+    "Unit": "documents",
+    "Ethical Risks": "Low",
+    "Provider": "LDC",
+    "Derived From": "nan",
+    "Paper Title": "nan",
+    "Paper Link": "nan",
+    "Script": "nan",
+    "Tokenized": "No",
+    "Host": "LDC",
+    "Access": "With-Fee",
+    "Cost": "1,500.00 $",
+    "Test Split": "No",
+    "Tasks": "speech recognition",
+    "Venue Title": "nan",
+    "Citations": "nan",
+    "Venue Type": "nan",
+    "Venue Name": "nan",
+    "Authors": "nan",
+    "Affiliations": "nan",
+    "Abstract": "nan",
+    "Added By": "Zaid Alyafeai"
+}
\ No newline at end of file
diff --git a/datasets/1997_hub5_arabic_transcripts.json b/datasets/1997_hub5_arabic_transcripts.json
new file mode 100644
index 0000000..ca9632d
--- /dev/null
+++ b/datasets/1997_hub5_arabic_transcripts.json
@@ -0,0 +1,36 @@
+{
+    "Name": "1997 HUB5 Arabic Transcripts",
+    "Subsets": [],
+    "HF Link": "nan",
+    "Link": "https://catalog.ldc.upenn.edu/LDC2002T39",
+    "License": "LDC User Agreement for Non-Members",
+    "Year": 2002,
+    "Language": "ar",
+    "Dialect": "ar-EG: (Arabic (Egypt))",
+    "Domain": "transcribed audio",
+    "Form": "text",
+    "Collection Style": "other",
+    "Description": "There are 40 data files. Each of the 20 calls has transcripts in two formats: .txt and .scr.",
+    "Volume": "40",
+    "Unit": "documents",
+    "Ethical Risks": "Low",
+    "Provider": "LDC",
+    "Derived From": "nan",
+    "Paper Title": "nan",
+    "Paper Link": "nan",
+    "Script": "nan",
+    "Tokenized": "No",
+    "Host": "LDC",
+    "Access": "With-Fee",
+    "Cost": "500.00 $",
+    "Test Split": "No",
+    "Tasks": "speech recognition",
+    "Venue Title": "nan",
+    "Citations": "nan",
+    "Venue Type": "nan",
+    "Venue Name": "nan",
+    "Authors": "nan",
+    "Affiliations": "nan",
+    "Abstract": "nan",
+    "Added By": "Zaid Alyafeai"
+}
\ No newline at end of file
diff --git a/datasets/2003_nist_language_recognition_evaluation.json b/datasets/2003_nist_language_recognition_evaluation.json
new file mode 100644
index 0000000..7e835c2
--- /dev/null
+++ b/datasets/2003_nist_language_recognition_evaluation.json
@@ -0,0 +1,36 @@
+{
+    "Name": "2003 NIST Language Recognition Evaluation",
+    "Subsets": [],
+    "HF Link": "nan",
+    "Link": "https://catalog.ldc.upenn.edu/LDC2006S31",
+    "License": "LDC User Agreement for Non-Members",
+    "Year": 2006,
+    "Language": "multilingual",
+    "Dialect": "ar-EG: (Arabic (Egypt))",
+    "Domain": "transcribed audio",
+    "Form": "spoken",
+    "Collection Style": "other",
+    "Description": "Each speech file is one side of a \"four wire\" telephone conversation represented as 8-bit, 8-kHz mulaw data. There are 11,830 speech files in SPHERE (.sph) format. The speech data was compiled from LDC's CALLFRIEND, CALLHOME, and Switchboard-2 corpora. Each file contains one test segment. The test segments are divided into three-second, 10-second, and 30-second tests, each in its own directory.",
+    "Volume": "46",
+    "Unit": "hours",
+    "Ethical Risks": "Low",
+    "Provider": "LDC",
+    "Derived From": "nan",
+    "Paper Title": "nan",
+    "Paper Link": "nan",
+    "Script": "Arab",
+    "Tokenized": "No",
+    "Host": "LDC",
+    "Access": "With-Fee",
+    "Cost": "500.00 $",
+    "Test Split": "No",
+    "Tasks": "language identification",
+    "Venue Title": "nan",
+    "Citations": "nan",
+    "Venue Type": "nan",
+    "Venue Name": "nan",
+    "Authors": "nan",
+    "Affiliations": "nan",
+    "Abstract": "nan",
+    "Added By": "Zaid Alyafeai"
+}
\ No newline at end of file
diff --git a/datasets/2003_nist_rich_transcription_evaluation_data.json b/datasets/2003_nist_rich_transcription_evaluation_data.json
new file mode 100644
index 0000000..a9d4327
--- /dev/null
+++ b/datasets/2003_nist_rich_transcription_evaluation_data.json
@@ -0,0 +1,36 @@
+{
+    "Name": "2003 NIST Rich Transcription Evaluation Data",
+    "Subsets": [],
+    "HF Link": "nan",
+    "Link": "https://catalog.ldc.upenn.edu/LDC2007S10",
+    "License": "LDC User Agreement for Non-Members",
+    "Year": 2007,
+    "Language": "multilingual",
+    "Dialect": "mixed",
+    "Domain": "transcribed audio",
+    "Form": "spoken",
+    "Collection Style": "other",
+    "Description": "The BN datasets were selected from TDT-4 sources collected in February 2001. The evaluation excerpts were transcribed to the nearest story boundary. The English BN dataset is approximately three hours long and is composed of 30-minute excerpts from six different broadcasts. The Mandarin Chinese BN dataset is approximately one hour long, consisting of 12-minute excerpts from five different broadcasts. The Arabic BN dataset is also approximately one hour long and contains 30-minute excerpts from two different broadcasts.",
+    "Volume": "1",
+    "Unit": "hours",
+    "Ethical Risks": "Low",
+    "Provider": "LDC",
+    "Derived From": "nan",
+    "Paper Title": "nan",
+    "Paper Link": "nan",
+    "Script": "Arab",
+    "Tokenized": "No",
+    "Host": "LDC",
+    "Access": "With-Fee",
+    "Cost": "2,000.00 $",
+    "Test Split": "No",
+    "Tasks": "speech recognition",
+    "Venue Title": "nan",
+    "Citations": "nan",
+    "Venue Type": "nan",
+    "Venue Name": "nan",
+    "Authors": "nan",
+    "Affiliations": "nan",
+    "Abstract": "nan",
+    "Added By": "Zaid Alyafeai"
+}
\ No newline at end of file
diff --git a/datasets/2005_nist_speaker_recognition_evaluation_test_data.json b/datasets/2005_nist_speaker_recognition_evaluation_test_data.json
new file mode 100644
index 0000000..22b3fde
--- /dev/null
+++ b/datasets/2005_nist_speaker_recognition_evaluation_test_data.json
@@ -0,0 +1,36 @@
+{
+    "Name": "2005 NIST Speaker Recognition Evaluation Test Data",
+    "Subsets": [],
+    "HF Link": "nan",
+    "Link": "https://catalog.ldc.upenn.edu/LDC2011S04",
+    "License": "LDC User Agreement for Non-Members",
+    "Year": 2011,
+    "Language": "multilingual",
+    "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))",
+    "Domain": "transcribed audio",
+    "Form": "spoken",
+    "Collection Style": "other",
+    "Description": "The speech data consists of conversational telephone speech with multi-channel data collected by LDC simultaneously from a number of auxiliary microphones. The files are organized into two segments: 10 second two-channel excerpts (continuous segments from single conversations that are estimated to contain approximately 10 seconds of actual speech in the channel of interest) and five minute two-channel conversations.",
+    "Volume": "525",
+    "Unit": "hours",
+    "Ethical Risks": "Low",
+    "Provider": "LDC",
+    "Derived From": "nan",
+    "Paper Title": "nan",
+    "Paper Link": "nan",
+    "Script": "Arab",
+    "Tokenized": "No",
+    "Host": "LDC",
+    "Access": "With-Fee",
+    "Cost": "400.00 $",
+    "Test Split": "No",
+    "Tasks": "speaker identification",
+    "Venue Title": "nan",
+    "Citations": "nan",
+    "Venue Type": "nan",
+    "Venue Name": "nan",
+    "Authors": "nan",
+    "Affiliations": "nan",
+    "Abstract": "nan",
+    "Added By": "Zaid Alyafeai"
+}
\ No newline at end of file
diff --git a/datasets/2005_nist_speaker_recognition_evaluation_training_data.json b/datasets/2005_nist_speaker_recognition_evaluation_training_data.json
new file mode 100644
index 0000000..08e5335
--- /dev/null
+++ b/datasets/2005_nist_speaker_recognition_evaluation_training_data.json
@@ -0,0 +1,36 @@
+{
+    "Name": "2005 NIST Speaker Recognition Evaluation Training Data",
+    "Subsets": [],
+    "HF Link": "nan",
+    "Link": "https://catalog.ldc.upenn.edu/LDC2011S01",
+    "License": "LDC User Agreement for Non-Members",
+    "Year": 2011,
+    "Language": "multilingual",
+    "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))",
+    "Domain": "transcribed audio",
+    "Form": "spoken",
+    "Collection Style": "other",
+    "Description": "The speech data consists of conversational telephone speech with multi-channel data collected simultaneously from a number of auxiliary microphones. The files are organized into two segments: 10 second two-channel excerpts (continuous segments from single conversations that are estimated to contain approximately 10 seconds of actual speech in the channel of interest) and five minute two-channel conversations.",
+    "Volume": "392",
+    "Unit": "hours",
+    "Ethical Risks": "Low",
+    "Provider": "LDC",
+    "Derived From": "nan",
+    "Paper Title": "nan",
+    "Paper Link": "nan",
+    "Script": "Arab",
+    "Tokenized": "No",
+    "Host": "LDC",
+    "Access": "With-Fee",
+    "Cost": "350.00 $",
+    "Test Split": "No",
+    "Tasks": "speaker identification",
+    "Venue Title": "nan",
+    "Citations": "nan",
+    "Venue Type": "nan",
+    "Venue Name": "nan",
+    "Authors": "nan",
+    "Affiliations": "nan",
+    "Abstract": "nan",
+    "Added By": "Zaid Alyafeai"
+}
\ No newline at end of file
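The "Dialect" values in these records follow a recognizable pattern such as "ar-EG: (Arabic (Egypt))" alongside bare values like "mixed". Below is a speculative parser for that pattern; the regex is an assumption based only on the values visible in this patch, not an official schema.

import re

# Hedged helper for "Dialect" values shaped like "ar-EG: (Arabic (Egypt))"
# or "ar-MSA: (Arabic (Modern Standard Arabic))"; bare values pass through.
DIALECT_RE = re.compile(r"^(?P<code>[a-z]{2}(?:-[A-Z]{2,3})?):\s*\((?P<label>.+)\)$")

def parse_dialect(value: str):
    match = DIALECT_RE.match(value)
    return (match.group("code"), match.group("label")) if match else (value, None)

print(parse_dialect("ar-EG: (Arabic (Egypt))"))  # ('ar-EG', 'Arabic (Egypt)')
print(parse_dialect("mixed"))                    # ('mixed', None)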
"Upon-Request", + "Cost": "nan", + "Test Split": "No", + "Tasks": "syntactic parsing", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/2006_nist_speaker_recognition_evaluation_test_set_part_1.json b/datasets/2006_nist_speaker_recognition_evaluation_test_set_part_1.json new file mode 100644 index 0000000..9f1c490 --- /dev/null +++ b/datasets/2006_nist_speaker_recognition_evaluation_test_set_part_1.json @@ -0,0 +1,36 @@ +{ + "Name": "2006 NIST Speaker Recognition Evaluation Test Set Part 1", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2011S10", + "License": "LDC User Agreement for Non-Members", + "Year": 2011, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "transcribed audio", + "Form": "spoken", + "Collection Style": "other", + "Description": "The speech data in this release was collected by LDC as part of the Mixer project, in particular Mixer Phases 1, 2, and 3. The Mixer project supports the development of robust speaker recognition technology by providing carefully collected and audited speech from a large pool of speakers recorded simultaneously across numerous microphones and in different communicative situations and/or in multiple languages. The data is mostly English speech, but includes some speech in Arabic, Bengali, Chinese, Farsi, Hindi, Korean, Russian, Spanish, Thai, and Urdu.", + "Volume": "437", + "Unit": "hours", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "300.00 $", + "Test Split": "No", + "Tasks": "speaker identification", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/2006_nist_speaker_recognition_evaluation_test_set_part_2.json b/datasets/2006_nist_speaker_recognition_evaluation_test_set_part_2.json new file mode 100644 index 0000000..fae5a9a --- /dev/null +++ b/datasets/2006_nist_speaker_recognition_evaluation_test_set_part_2.json @@ -0,0 +1,36 @@ +{ + "Name": "2006 NIST Speaker Recognition Evaluation Test Set Part 2", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2012S01", + "License": "LDC User Agreement for Non-Members", + "Year": 2012, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "transcribed audio", + "Form": "spoken", + "Collection Style": "other", + "Description": "The speech data in this release was collected by LDC as part of the Mixer project, in particular Mixer Phases 1, 2, and 3. The Mixer project supports the development of robust speaker recognition technology by providing carefully collected and audited speech from a large pool of speakers recorded simultaneously across numerous microphones and in different communicative situations and/or in multiple languages. 
The data is mostly English speech, but includes some speech in Arabic, Bengali, Chinese, Farsi, Hindi, Korean, Russian, Spanish, Thai, and Urdu.", + "Volume": "568", + "Unit": "hours", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "350.00 $", + "Test Split": "No", + "Tasks": "speaker identification", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/2006_nist_speaker_recognition_evaluation_training_set.json b/datasets/2006_nist_speaker_recognition_evaluation_training_set.json new file mode 100644 index 0000000..53edef5 --- /dev/null +++ b/datasets/2006_nist_speaker_recognition_evaluation_training_set.json @@ -0,0 +1,36 @@ +{ + "Name": "2006 NIST Speaker Recognition Evaluation Training Set", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2011S09", + "License": "LDC User Agreement for Non-Members", + "Year": 2011, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "transcribed audio", + "Form": "spoken", + "Collection Style": "other", + "Description": "The speech data in this release was collected by LDC as part of the Mixer project, in particular Mixer Phases 1, 2, and 3. The Mixer project supports the development of robust speaker recognition technology by providing carefully collected and audited speech from a large pool of speakers recorded simultaneously across numerous microphones and in different communicative situations and/or in multiple languages. The data is mostly English speech, but includes some speech in Arabic, Bengali, Chinese, Hindi, Korean, Russian, Thai, and Urdu.", + "Volume": "595", + "Unit": "hours", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "350.00 $", + "Test Split": "No", + "Tasks": "speaker identification", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/2006_nist_spoken_term_detection_development_set.json b/datasets/2006_nist_spoken_term_detection_development_set.json new file mode 100644 index 0000000..5c6ff7a --- /dev/null +++ b/datasets/2006_nist_spoken_term_detection_development_set.json @@ -0,0 +1,36 @@ +{ + "Name": "2006 NIST Spoken Term Detection Development Set", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2011S02", + "License": "LDC User Agreement for Non-Members", + "Year": 2011, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "transcribed audio", + "Form": "spoken", + "Collection Style": "other", + "Description": "The development corpus consists of three data genres: broadcast news (BNews), conversational telephone speech (CTS) and conference room meetings (CONFMTG). 
diff --git a/datasets/2006_nist_spoken_term_detection_development_set.json b/datasets/2006_nist_spoken_term_detection_development_set.json
new file mode 100644
index 0000000..5c6ff7a
--- /dev/null
+++ b/datasets/2006_nist_spoken_term_detection_development_set.json
@@ -0,0 +1,36 @@
+{
+    "Name": "2006 NIST Spoken Term Detection Development Set",
+    "Subsets": [],
+    "HF Link": "nan",
+    "Link": "https://catalog.ldc.upenn.edu/LDC2011S02",
+    "License": "LDC User Agreement for Non-Members",
+    "Year": 2011,
+    "Language": "multilingual",
+    "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))",
+    "Domain": "transcribed audio",
+    "Form": "spoken",
+    "Collection Style": "other",
+    "Description": "The development corpus consists of three data genres: broadcast news (BNews), conversational telephone speech (CTS) and conference room meetings (CONFMTG). The broadcast news material was collected in 2001 by LDC's broadcast collection system from the following sources: ABC (English), China Broadcasting System (Chinese), China Central TV (Chinese), China National Radio (Chinese), China Television System (Chinese), CNN (English), MSNBC/NBC (English), Nile TV (Arabic), Public Radio International (English) and Voice of America (Arabic, Chinese, English). The CTS data was taken from the Switchboard data sets (e.g., Switchboard-2 Phase 1 LDC98S75, Switchboard-2 Phase 2 LDC99S79) and the Fisher corpora (e.g., Fisher English Training Speech Part 1 LDC2004S13), also collected by LDC. The conference room meeting material consists of goal-oriented, small group roundtable meetings and was collected in 2001, 2004 and 2005 by NIST, the International Computer Science Institute (Berkeley, California), Carnegie Mellon University (Pittsburgh, PA) and Virginia Polytechnic Institute and State University (Blacksburg, VA) as part of the AMI corpus project.",
+    "Volume": "18",
+    "Unit": "hours",
+    "Ethical Risks": "Low",
+    "Provider": "LDC",
+    "Derived From": "nan",
+    "Paper Title": "nan",
+    "Paper Link": "nan",
+    "Script": "Arab",
+    "Tokenized": "No",
+    "Host": "LDC",
+    "Access": "With-Fee",
+    "Cost": "800.00 $",
+    "Test Split": "No",
+    "Tasks": "spoken term detection",
+    "Venue Title": "nan",
+    "Citations": "nan",
+    "Venue Type": "nan",
+    "Venue Name": "nan",
+    "Authors": "nan",
+    "Affiliations": "nan",
+    "Abstract": "nan",
+    "Added By": "Zaid Alyafeai"
+}
\ No newline at end of file
diff --git a/datasets/2006_nist_spoken_term_detection_evaluation_set.json b/datasets/2006_nist_spoken_term_detection_evaluation_set.json
new file mode 100644
index 0000000..db60d36
--- /dev/null
+++ b/datasets/2006_nist_spoken_term_detection_evaluation_set.json
@@ -0,0 +1,36 @@
+{
+    "Name": "2006 NIST Spoken Term Detection Evaluation Set",
+    "Subsets": [],
+    "HF Link": "nan",
+    "Link": "https://catalog.ldc.upenn.edu/LDC2011S03",
+    "License": "LDC User Agreement for Non-Members",
+    "Year": 2011,
+    "Language": "multilingual",
+    "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))",
+    "Domain": "transcribed audio",
+    "Form": "spoken",
+    "Collection Style": "other",
+    "Description": "The evaluation corpus consists of three data genres: broadcast news (BNews), conversational telephone speech (CTS) and conference room meetings (CONFMTG). The broadcast news material was collected in 2003 and 2004 by LDC's broadcast collection system from the following sources: ABC (English), Aljazeera (Arabic), China Central TV (Chinese), CNN (English), CNBC (English), Dubai TV (Arabic), New Tang Dynasty TV (Chinese), Public Radio International (English) and Radio Free Asia (Chinese). The CTS data was taken from the Switchboard data sets (e.g., Switchboard-2 Phase 1 LDC98S75, Switchboard-2 Phase 2 LDC99S79) and the Fisher corpora (e.g., Fisher English Training Speech Part 1 LDC2004S13), also collected by LDC. The conference room meeting material consists of goal-oriented, small group roundtable meetings and was collected in 2004 and 2005 by NIST, the International Computer Science Institute (Berkeley, California), Carnegie Mellon University (Pittsburgh, PA), TNO (The Netherlands) and Virginia Polytechnic Institute and State University (Blacksburg, VA) as part of the AMI corpus project.",
+    "Volume": "18",
+    "Unit": "hours",
+    "Ethical Risks": "Low",
+    "Provider": "LDC",
+    "Derived From": "nan",
+    "Paper Title": "nan",
+    "Paper Link": "nan",
+    "Script": "Arab",
+    "Tokenized": "No",
+    "Host": "LDC",
+    "Access": "With-Fee",
+    "Cost": "800.00 $",
+    "Test Split": "No",
+    "Tasks": "spoken term detection",
+    "Venue Title": "nan",
+    "Citations": "nan",
+    "Venue Type": "nan",
+    "Venue Name": "nan",
+    "Authors": "nan",
+    "Affiliations": "nan",
+    "Abstract": "nan",
+    "Added By": "Zaid Alyafeai"
+}
\ No newline at end of file
diff --git a/datasets/2007_conll_shared_task_-_arabic_&_english.json b/datasets/2007_conll_shared_task_-_arabic_&_english.json
new file mode 100644
index 0000000..d6b2c19
--- /dev/null
+++ b/datasets/2007_conll_shared_task_-_arabic_&_english.json
@@ -0,0 +1,36 @@
+{
+    "Name": "2007 CoNLL Shared Task - Arabic & English",
+    "Subsets": [],
+    "HF Link": "nan",
+    "Link": "https://catalog.ldc.upenn.edu/LDC2018T08",
+    "License": "LDC User Agreement for Non-Members",
+    "Year": 2018,
+    "Language": "multilingual",
+    "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))",
+    "Domain": "news articles",
+    "Form": "text",
+    "Collection Style": "other",
+    "Description": "The source data in the treebanks in this release consists principally of various texts (e.g., textbooks, news, literature) annotated in dependency format. In general, dependency grammar is based on the idea that the verb is the center of the clause structure and that other units in the sentence are connected to the verb as directed links or dependencies. This is a one-to-one correspondence: for every element in the sentence there is one node in the sentence structure that corresponds to that element. In constituency or phrase structure grammars, on the other hand, clauses are divided into noun phrases and verb phrases and in each sentence, one or more nodes may correspond to one element. The Penn Treebank (LDC99T42) is an example of a constituency or phrase structure approach. All of the data sets in this release are dependency treebanks.",
+    "Volume": "nan",
+    "Unit": "tokens",
+    "Ethical Risks": "Low",
+    "Provider": "LDC",
+    "Derived From": "nan",
+    "Paper Title": "nan",
+    "Paper Link": "nan",
+    "Script": "nan",
+    "Tokenized": "No",
+    "Host": "LDC",
+    "Access": "Upon-Request",
+    "Cost": "nan",
+    "Test Split": "No",
+    "Tasks": "syntactic parsing",
+    "Venue Title": "nan",
+    "Citations": "nan",
+    "Venue Type": "nan",
+    "Venue Name": "nan",
+    "Authors": "nan",
+    "Affiliations": "nan",
+    "Abstract": "nan",
+    "Added By": "Zaid Alyafeai"
+}
\ No newline at end of file
diff --git a/datasets/2007_nist_language_recognition_evaluation_supplemental_training_set.json b/datasets/2007_nist_language_recognition_evaluation_supplemental_training_set.json
new file mode 100644
index 0000000..ee9b954
--- /dev/null
+++ b/datasets/2007_nist_language_recognition_evaluation_supplemental_training_set.json
@@ -0,0 +1,36 @@
+{
+    "Name": "2007 NIST Language Recognition Evaluation Supplemental Training Set",
+    "Subsets": [],
+    "HF Link": "nan",
+    "Link": "https://catalog.ldc.upenn.edu/LDC2009S05",
+    "License": "LDC User Agreement for Non-Members",
+    "Year": 2009,
+    "Language": "multilingual",
+    "Dialect": "ar-EG: (Arabic (Egypt))",
+    "Domain": "transcribed audio",
+    "Form": "spoken",
+    "Collection Style": "other",
+    "Description": "The supplemental training material in this release consists of the following:",
+    "Volume": "118",
+    "Unit": "hours",
+    "Ethical Risks": "Low",
+    "Provider": "LDC",
+    "Derived From": "nan",
+    "Paper Title": "nan",
+    "Paper Link": "nan",
+    "Script": "Arab",
+    "Tokenized": "No",
+    "Host": "LDC",
+    "Access": "With-Fee",
+    "Cost": "500.00 $",
+    "Test Split": "No",
+    "Tasks": "language identification",
+    "Venue Title": "nan",
+    "Citations": "nan",
+    "Venue Type": "nan",
+    "Venue Name": "nan",
+    "Authors": "nan",
+    "Affiliations": "nan",
+    "Abstract": "nan",
+    "Added By": "Zaid Alyafeai"
+}
\ No newline at end of file
diff --git a/datasets/2007_nist_language_recognition_evaluation_test_set.json b/datasets/2007_nist_language_recognition_evaluation_test_set.json
new file mode 100644
index 0000000..cad91e4
--- /dev/null
+++ b/datasets/2007_nist_language_recognition_evaluation_test_set.json
@@ -0,0 +1,36 @@
+{
+    "Name": "2007 NIST Language Recognition Evaluation Test Set",
+    "Subsets": [],
+    "HF Link": "nan",
+    "Link": "https://catalog.ldc.upenn.edu/LDC2009S04",
+    "License": "LDC User Agreement for Non-Members",
+    "Year": 2009,
+    "Language": "multilingual",
+    "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))",
+    "Domain": "transcribed audio",
+    "Form": "spoken",
+    "Collection Style": "other",
+    "Description": "Each speech file in the test data is one side of a 4-wire telephone conversation represented as 8-bit 8-kHz mu-law format. There are 7530 speech files in SPHERE (.sph) format for a total of 66 hours of speech. The speech data was compiled from LDC's CALLFRIEND, Fisher Spanish, and Mixer 3 corpora and from data collected by Oregon Health and Science University (OHSU), Beaverton, Oregon.",
+    "Volume": "66",
+    "Unit": "hours",
+    "Ethical Risks": "Low",
+    "Provider": "LDC",
+    "Derived From": "nan",
+    "Paper Title": "nan",
+    "Paper Link": "nan",
+    "Script": "Arab",
+    "Tokenized": "No",
+    "Host": "LDC",
+    "Access": "With-Fee",
+    "Cost": "500.00 $",
+    "Test Split": "No",
+    "Tasks": "language identification",
+    "Venue Title": "nan",
+    "Citations": "nan",
+    "Venue Type": "nan",
+    "Venue Name": "nan",
+    "Authors": "nan",
+    "Affiliations": "nan",
+    "Abstract": "nan",
+    "Added By": "Zaid Alyafeai"
+}
\ No newline at end of file
diff --git a/datasets/2008_2010_nist_metrics_for_machine_translation_(metricsmatr)_gale_evaluation_set.json b/datasets/2008_2010_nist_metrics_for_machine_translation_(metricsmatr)_gale_evaluation_set.json
new file mode 100644
index 0000000..e6d65ad
--- /dev/null
+++ b/datasets/2008_2010_nist_metrics_for_machine_translation_(metricsmatr)_gale_evaluation_set.json
@@ -0,0 +1,36 @@
+{
+    "Name": "2008/2010 NIST Metrics for Machine Translation (MetricsMaTr) GALE Evaluation Set",
+    "Subsets": [],
+    "HF Link": "nan",
+    "Link": "https://catalog.ldc.upenn.edu/LDC2011T05",
+    "License": "LDC User Agreement for Non-Members",
+    "Year": 2011,
+    "Language": "multilingual",
+    "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))",
+    "Domain": "web pages",
+    "Form": "text",
+    "Collection Style": "other",
+    "Description": "This release contains 149 documents with corresponding reference translations (Arabic-to-English and Chinese-to-English), system translations and human assessments. The human assessments include the following: Adequacy7 (a 7-point scale for judging the meaning of a system translation with respect to the reference translation), Adequacy Yes/No (whether the given system segment meant essentially the same as the reference translation), Preference (the judge's preference between two candidate translations when compared to a human reference translation) and HTER (Human Targeted Error Rate, human edits to a system translation to have the same meaning as a reference translation).",
+    "Volume": "149",
+    "Unit": "documents",
+    "Ethical Risks": "Low",
+    "Provider": "LDC",
+    "Derived From": "nan",
+    "Paper Title": "nan",
+    "Paper Link": "nan",
+    "Script": "Arab",
+    "Tokenized": "No",
+    "Host": "LDC",
+    "Access": "With-Fee",
+    "Cost": "250.00 $",
+    "Test Split": "No",
+    "Tasks": "machine translation",
+    "Venue Title": "nan",
+    "Citations": "nan",
+    "Venue Type": "nan",
+    "Venue Name": "nan",
+    "Authors": "nan",
+    "Affiliations": "nan",
+    "Abstract": "nan",
+    "Added By": "Zaid Alyafeai"
+}
\ No newline at end of file
"The MetricsMATR08 development data set released here is reflective of the test data set only to a degree; the evaluation data set contains more varied data -- from more genres, more source languages, more systems and different evaluations -- than this development data set. There are also more types of human assessments for the test data. The MetricsMATR08 test data remains unseen to allow for repeated use as test data.", + "Volume": "249", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "150.00 $", + "Test Split": "No", + "Tasks": "machine translation", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/2008_nist_speaker_recognition_evaluation_test_set.json b/datasets/2008_nist_speaker_recognition_evaluation_test_set.json new file mode 100644 index 0000000..f46b7d0 --- /dev/null +++ b/datasets/2008_nist_speaker_recognition_evaluation_test_set.json @@ -0,0 +1,36 @@ +{ + "Name": "2008 NIST Speaker Recognition Evaluation Test Set", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2011S08", + "License": "LDC User Agreement for Non-Members", + "Year": 2011, + "Language": "multilingual", + "Dialect": "mixed", + "Domain": "transcribed audio", + "Form": "spoken", + "Collection Style": "other", + "Description": "The speech data in this release was collected in 2007 by LDC at its Human Subjects Collection facility in Philadelphia and by the International Computer Science Institute (ICSI) at the University of California, Berkeley. This collection was part of the Mixer 5 project, which was designed to support the development of robust speaker recognition technology by providing carefully collected and audited speech from a large pool of speakers recorded simultaneously across numerous microphones and in different communicative situations and/or in multiple languages. Mixer participants were native English and bilingual English speakers. The telephone speech in this corpus is predominantly English, but also includes the above languages. All interview segments are in English. 
Telephone speech represents approximately 368 hours of the data, whereas microphone speech represents the other 574 hours.", + "Volume": "942", + "Unit": "hours", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "600.00 $", + "Test Split": "No", + "Tasks": "speaker identification", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/2008_nist_speaker_recognition_evaluation_training_set_part_1.json b/datasets/2008_nist_speaker_recognition_evaluation_training_set_part_1.json new file mode 100644 index 0000000..c68898b --- /dev/null +++ b/datasets/2008_nist_speaker_recognition_evaluation_training_set_part_1.json @@ -0,0 +1,36 @@ +{ + "Name": "2008 NIST Speaker Recognition Evaluation Training Set Part 1", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2011S05", + "License": "LDC User Agreement for Non-Members", + "Year": 2011, + "Language": "multilingual", + "Dialect": "mixed", + "Domain": "transcribed audio", + "Form": "spoken", + "Collection Style": "other", + "Description": "The speech data in this release was collected in 2007 by LDC at its Human Subjects Collection facility in Philadelphia and by the International Computer Science Institute (ICSI) at the University of California, Berkley. This collection was part of the Mixer 5 project, which was designed to support the development of robust speaker recognition technology by providing carefully collected and audited speech from a large pool of speakers recorded simultaneously across numerous microphones and in different communicative situations and/or in multiple languages.", + "Volume": "640", + "Unit": "hours", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "400.00 $", + "Test Split": "No", + "Tasks": "speaker identification", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/2008_nist_speaker_recognition_evaluation_training_set_part_2.json b/datasets/2008_nist_speaker_recognition_evaluation_training_set_part_2.json new file mode 100644 index 0000000..da9ac77 --- /dev/null +++ b/datasets/2008_nist_speaker_recognition_evaluation_training_set_part_2.json @@ -0,0 +1,36 @@ +{ + "Name": "2008 NIST Speaker Recognition Evaluation Training Set Part 2", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2011S07", + "License": "LDC User Agreement for Non-Members", + "Year": 2011, + "Language": "multilingual", + "Dialect": "mixed", + "Domain": "transcribed audio", + "Form": "spoken", + "Collection Style": "other", + "Description": "The speech data in this release was collected in 2007 by LDC at its Human Subjects Data Collection Laboratories in Philadelphia and by the International Computer Science Institute (ICSI) at the University of California, Berkeley. 
diff --git a/datasets/2008_nist_speaker_recognition_evaluation_training_set_part_2.json b/datasets/2008_nist_speaker_recognition_evaluation_training_set_part_2.json
new file mode 100644
index 0000000..da9ac77
--- /dev/null
+++ b/datasets/2008_nist_speaker_recognition_evaluation_training_set_part_2.json
@@ -0,0 +1,36 @@
+{
+    "Name": "2008 NIST Speaker Recognition Evaluation Training Set Part 2",
+    "Subsets": [],
+    "HF Link": "nan",
+    "Link": "https://catalog.ldc.upenn.edu/LDC2011S07",
+    "License": "LDC User Agreement for Non-Members",
+    "Year": 2011,
+    "Language": "multilingual",
+    "Dialect": "mixed",
+    "Domain": "transcribed audio",
+    "Form": "spoken",
+    "Collection Style": "other",
+    "Description": "The speech data in this release was collected in 2007 by LDC at its Human Subjects Data Collection Laboratories in Philadelphia and by the International Computer Science Institute (ICSI) at the University of California, Berkeley. This collection was part of the Mixer 5 project, which was designed to support the development of robust speaker recognition technology by providing carefully collected and audited speech from a large pool of speakers recorded simultaneously across numerous microphones and in different communicative situations and/or in multiple languages. Mixer participants were native English speakers and bilingual English speakers. The telephone speech in this corpus is predominantly English, but also includes the above languages. All interview segments are in English. Telephone speech represents approximately 523 hours of the data, and microphone speech represents the other 427 hours.",
+    "Volume": "950",
+    "Unit": "hours",
+    "Ethical Risks": "Low",
+    "Provider": "LDC",
+    "Derived From": "nan",
+    "Paper Title": "nan",
+    "Paper Link": "nan",
+    "Script": "Arab",
+    "Tokenized": "No",
+    "Host": "LDC",
+    "Access": "With-Fee",
+    "Cost": "600.00 $",
+    "Test Split": "No",
+    "Tasks": "speaker identification",
+    "Venue Title": "nan",
+    "Citations": "nan",
+    "Venue Type": "nan",
+    "Venue Name": "nan",
+    "Authors": "nan",
+    "Affiliations": "nan",
+    "Abstract": "nan",
+    "Added By": "Zaid Alyafeai"
+}
\ No newline at end of file
diff --git a/datasets/2011_nist_language_recognition_evaluation_test_set.json b/datasets/2011_nist_language_recognition_evaluation_test_set.json
new file mode 100644
index 0000000..9ddcf19
--- /dev/null
+++ b/datasets/2011_nist_language_recognition_evaluation_test_set.json
@@ -0,0 +1,36 @@
+{
+    "Name": "2011 NIST Language Recognition Evaluation Test Set",
+    "Subsets": [],
+    "HF Link": "nan",
+    "Link": "https://catalog.ldc.upenn.edu/LDC2018S06",
+    "License": "LDC User Agreement for Non-Members",
+    "Year": 2018,
+    "Language": "multilingual",
+    "Dialect": "mixed",
+    "Domain": "transcribed audio",
+    "Form": "spoken",
+    "Collection Style": "other",
+    "Description": "This release includes training data for nine language varieties that had not been represented in prior LRE cycles -- Arabic (Iraqi), Arabic (Levantine), Arabic (Maghrebi), Arabic (Standard), Czech, Lao, Punjabi, Polish, and Slovak -- contained in 893 audited segments of roughly 30 seconds duration and in 400 full-length CTS recordings. The evaluation test set comprises a total of 29,511 audio files, all manually audited at LDC for language and divided equally into three different test conditions according to the nominal amount of speech content per segment.",
+    "Volume": "204",
+    "Unit": "hours",
+    "Ethical Risks": "Low",
+    "Provider": "LDC",
+    "Derived From": "nan",
+    "Paper Title": "nan",
+    "Paper Link": "nan",
+    "Script": "Arab",
+    "Tokenized": "No",
+    "Host": "LDC",
+    "Access": "With-Fee",
+    "Cost": "500.00 $",
+    "Test Split": "No",
+    "Tasks": "language identification",
+    "Venue Title": "nan",
+    "Citations": "nan",
+    "Venue Type": "nan",
+    "Venue Name": "nan",
+    "Authors": "nan",
+    "Affiliations": "nan",
+    "Abstract": "nan",
+    "Added By": "Zaid Alyafeai"
+}
\ No newline at end of file
diff --git a/datasets/2018_nist_speaker_recognition_evaluation_test_set.json b/datasets/2018_nist_speaker_recognition_evaluation_test_set.json
new file mode 100644
index 0000000..8f047c5
--- /dev/null
+++ b/datasets/2018_nist_speaker_recognition_evaluation_test_set.json
@@ -0,0 +1,36 @@
+{
+    "Name": "2018 NIST Speaker Recognition Evaluation Test Set",
+    "Subsets": [],
+    "HF Link": "nan",
+    "Link": "https://catalog.ldc.upenn.edu/LDC2020S04",
+    "License": "LDC User Agreement for Non-Members",
+    "Year": 2020,
+    "Language": "multilingual",
+    "Dialect": "ar-TN: (Arabic (Tunisia))",
+    "Domain": "transcribed audio",
+    "Form": "spoken",
+    "Collection Style": "other",
+    "Description": "The telephone speech data was drawn from the Call My Net 2 (CMN2) collection conducted by LDC in Tunisia in which Tunisian Arabic speakers called friends or relatives who agreed to record their telephone conversations lasting between 8-10 minutes. The speech segments include PSTN (public switched telephone network) and VOIP (voice over IP) data.",
+    "Volume": "396",
+    "Unit": "hours",
+    "Ethical Risks": "Low",
+    "Provider": "LDC",
+    "Derived From": "nan",
+    "Paper Title": "nan",
+    "Paper Link": "nan",
+    "Script": "Arab",
+    "Tokenized": "No",
+    "Host": "LDC",
+    "Access": "With-Fee",
+    "Cost": "750.00 $",
+    "Test Split": "No",
+    "Tasks": "speaker identification",
+    "Venue Title": "nan",
+    "Citations": "nan",
+    "Venue Type": "nan",
+    "Venue Name": "nan",
+    "Authors": "nan",
+    "Affiliations": "nan",
+    "Abstract": "nan",
+    "Added By": "Zaid Alyafeai"
+}
\ No newline at end of file
The database contains about 20 hours of continuous speech recorded through one desktop omni microphone by 205 native speakers from Egypt (about 30% female and 70% male), aged between 20 and 45.", + "Volume": "20", + "Unit": "hours", + "Ethical Risks": "Low", + "Provider": "-", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "ELRA", + "Access": "With-Fee", + "Cost": "1,000.00\u20ac", + "Test Split": "No", + "Tasks": "speech recognition", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/a7'ta.json b/datasets/a7'ta.json new file mode 100644 index 0000000..c3f484b --- /dev/null +++ b/datasets/a7'ta.json @@ -0,0 +1,36 @@ +{ + "Name": "A7'ta", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/A7ta", + "Link": "https://github.com/iwan-rg/A-Monolingual-Arabic-Parallel-Corpus-", + "License": "unknown", + "Year": 2019, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "books", + "Form": "text", + "Collection Style": "crawling", + "Description": "The dataset contains 300 documents, 445 erroneous sentences and their error-free counterparts, and a total of 3,532 words.", + "Volume": "300", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "King Saud University", + "Derived From": "nan", + "Paper Title": "A7\u05f3ta: Data on a monolingual Arabic parallel corpus for grammar checking", + "Paper Link": "https://www.sciencedirect.com/science/article/pii/S2352340918315397", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "grammar checking", + "Venue Title": "Data in brief", + "Citations": "2.0", + "Venue Type": "journal", + "Venue Name": "Data in brief", + "Authors": "Nora Madi,Hend Suliman Al-Khalifa", + "Affiliations": ",", + "Abstract": "Grammar error correction can be considered as a \u201ctranslation\u201d problem, such that an erroneous sentence is \u201ctranslated\u201d into a correct version of the sentence in the same language. This can be accomplished by employing techniques like Statistical Machine Translation (SMT) or Neural Machine Translation (NMT). Producing models for SMT or NMT for the goal of grammar correction requires monolingual parallel corpora of a certain language. This data article presents a monolingual parallel corpus of Arabic text called A7\u05f3ta. It contains 470 erroneous sentences and their 470 error-free counterparts. This is an Arabic parallel corpus that can be used as a linguistic resource for Arabic natural language processing (NLP), mainly to train sequence-to-sequence models for grammar checking. Sentences were manually collected from a book that has been prepared as a guide for correctly writing and using Arabic grammar and other linguistic features. 
Although there are a number of available Arabic corpora of errors and corrections [2], such as QALB [10] and the Arabic Learner Corpus [11], the data we present in this article is an effort to increase the number of freely available Arabic corpora of errors and corrections by providing a detailed error specification and leveraging the work of language experts.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/a_corpus_of_arabic_literature_(19-20th_centuries)_for_stylometric_tests.json b/datasets/a_corpus_of_arabic_literature_(19-20th_centuries)_for_stylometric_tests.json new file mode 100644 index 0000000..a412d46 --- /dev/null +++ b/datasets/a_corpus_of_arabic_literature_(19-20th_centuries)_for_stylometric_tests.json @@ -0,0 +1,36 @@ +{ + "Name": "A Corpus of Arabic Literature (19-20th centuries) for Stylometric Tests", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/Arabic_Literature", + "Link": "https://zenodo.org/record/5772261#.YqTaodrMLIV", + "License": "CC BY 4.0", + "Year": 2021, + "Language": "ar", + "Dialect": "ar-CLS: (Arabic (Classic))", + "Domain": "books", + "Form": "text", + "Collection Style": "crawling", + "Description": "The dataset contains three collections of mainly literary Arabic texts from the 19th and early 20th centuries. corpus022_JurjiZaydan_Dated is a dated corpus of 22 historical novels by Jurj\u012b Zayd\u0101n. It is well established that Jurj\u012b Zayd\u0101n published roughly one novel per year, and the dates of publication are well known, which makes this corpus valuable material for testing chronological changes in the style of individual writers. corpus065 is a corpus of 65 books by 8 authors; corpus300 contains 300 books by 28 authors.", + "Volume": "387", + "Unit": "documents", + "Ethical Risks": "nan", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "zenodo", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "stylometric tests", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "Romanov Maxim", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Kamel GAANOUN" +} \ No newline at end of file diff --git a/datasets/absa-hotels.json b/datasets/absa-hotels.json new file mode 100644 index 0000000..7044bad --- /dev/null +++ b/datasets/absa-hotels.json @@ -0,0 +1,36 @@ +{ + "Name": "ABSA-Hotels", + "Subsets": [], + "HF Link": "nan", + "Link": "https://github.com/msmadi/ABSA-Hotels", + "License": "MIT License", + "Year": 2016, + "Language": "multilingual", + "Dialect": "mixed", + "Domain": "web pages", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "Around 15,562 hotel reviews were thoroughly reviewed by the authors of this research, and a subset of 2,291 reviews was selected. 
The original dataset was collected from well-known hotel booking websites such as Booking.com and TripAdvisor.com.", + "Volume": "24,028", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "SemEval-2016 Task 5: Aspect Based Sentiment Analysis", + "Paper Link": "https://aclanthology.org/S16-1002.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "review classification", + "Venue Title": "SemEval", + "Citations": "nan", + "Venue Type": "conference", + "Venue Name": "International Workshop on Semantic Evaluation", + "Authors": "Maria Pontiki, Dimitrios Galanis, Haris Papageorgiou, Ion Androutsopoulos, Suresh Manandhar, Mohammad AL-Smadi, Mahmoud Al-Ayyoub, Yanyan Zhao, Bing Qin, Orph\u00e9e De Clercq, V\u00e9ronique Hoste, Marianna Apidianaki, Xavier Tannier, Natalia Loukachevitch, Evgeny Kotelnikov, Nuria Bel, Salud Mar\u00eda Jim\u00e9nez-Zafra, G\u00fcl\u015fen Eryi\u011fit", + "Affiliations": "nan", + "Abstract": "This paper describes the SemEval 2016 shared task on Aspect Based Sentiment Analysis (ABSA), a continuation of the respective tasks of 2014 and 2015. In its third year, the task provided 19 training and 20 testing datasets for 8 languages and 7 domains, as well as a common evaluation procedure. From these datasets, 25 were for sentence-level and 14 for text-level ABSA; the latter was introduced for the first time as a subtask in SemEval. The task attracted 245 submissions from 29 teams.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/ace_2004_multilingual_training_corpus.json b/datasets/ace_2004_multilingual_training_corpus.json new file mode 100644 index 0000000..b85a234 --- /dev/null +++ b/datasets/ace_2004_multilingual_training_corpus.json @@ -0,0 +1,36 @@ +{ + "Name": "ACE 2004 Multilingual Training Corpus", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2005T09", + "License": "LDC User Agreement for Non-Members", + "Year": 2004, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "ACE 2004 Multilingual Training Corpus was developed by the Linguistic Data Consortium (LDC) and contains text of various genres in English (158,000 words), Chinese (307,000 characters, 154,000 words), and Arabic (151,000 words), annotated for entities and relations.", + "Volume": "689", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "3,000.00 $", + "Test Split": "No", + "Tasks": "named entity recognition", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/ace_2005_multilingual_training_corpus.json b/datasets/ace_2005_multilingual_training_corpus.json new file mode 100644 index 0000000..22476a8 --- /dev/null +++ b/datasets/ace_2005_multilingual_training_corpus.json @@ -0,0 +1,36 @@ +{ + "Name": "ACE 2005 Multilingual Training Corpus", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2006T06", + "License": "LDC User Agreement for Non-Members", + 
"Year": 2006, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "Below is information about the amount of data in this release and its annotation status. Further information such as breakdown of genres and formats can be found in the associated README file.", + "Volume": "433", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "4,000.00 $", + "Test Split": "No", + "Tasks": "information retrieval", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/ace_2007_multilingual_training_corpus.json b/datasets/ace_2007_multilingual_training_corpus.json new file mode 100644 index 0000000..ebd7802 --- /dev/null +++ b/datasets/ace_2007_multilingual_training_corpus.json @@ -0,0 +1,36 @@ +{ + "Name": "ACE 2007 Multilingual Training Corpus", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2014T18", + "License": "LDC User Agreement for Non-Members", + "Year": 2014, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "other", + "Description": "The Arabic data is composed of newswire (60%) published in October 2000-December 2000 and weblogs (40%) published during the period November 2004-February 2005. The Spanish data set consists entirely of newswire material from multiple sources published in January 2005-April 2005.", + "Volume": "98,353", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "1,000.00 $", + "Test Split": "No", + "Tasks": "information retrieval", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/acqad.json b/datasets/acqad.json new file mode 100644 index 0000000..d087dbc --- /dev/null +++ b/datasets/acqad.json @@ -0,0 +1,36 @@ +{ + "Name": "ACQAD", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/acqad_multihop,https://hf.co/datasets/arbml/acqad_comparison", + "Link": "https://www.kaggle.com/datasets/abdellahhamouda/acqad-dataset", + "License": "unknown", + "Year": 2022, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "wikipedia", + "Form": "text", + "Collection Style": "crawling", + "Description": "ACQAD, contains more than 118k questions, covering both comparison and multi-hop types. 
Each question-answer pair is decomposed into a set of single-hop questions, allowing QA systems to reduce question complexity and explain the reasoning steps.", + "Volume": "118,841", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "EMP Algeria, LORIA France", + "Derived From": "nan", + "Paper Title": "ACQAD: A Dataset for Arabic Complex Question Answering", + "Paper Link": "https://hal.science/hal-03992129v1/preview/ICCSAITCS_2022_paper_9032.pdf", + "Script": "Arab-Latn", + "Tokenized": "No", + "Host": "kaggle", + "Access": "Free", + "Cost": "0", + "Test Split": "No", + "Tasks": "question answering", + "Venue Title": "ICCSAITCS", + "Citations": "nan", + "Venue Type": "conference", + "Venue Name": "International Conference on Cyber Security, Artificial Intelligence and Theoretical Computer Science", + "Authors": "Abdellah HAMOUDA SIDHOUM, M'Hamed MATAOUI, Faouzi SEBBAK, Kamel SMAILI", + "Affiliations": "EMP Algeria, LORIA France", + "Abstract": "In this paper, we tackle the problem of Arabic complex Question Answering (QA), where models are required to reason over multiple documents to find the answer. Indeed, no Arabic dataset is available for this type of question. To fill this gap, we propose a new approach to automatically generate a dataset for the Arabic complex question answering task. The proposed approach is based on using an effective workflow with a set of templates. The generated dataset, denoted as ACQAD, contains more than 118k questions, covering both comparison and multi-hop types. Each question-answer pair is decomposed into a set of single-hop questions, allowing QA systems to reduce question complexity and explain the reasoning steps. We then provide a statistical analysis of the produced dataset. Afterwards, we will make the corpus available to the international community.", + "Added By": "Abdellah HAMOUDA SIDHOUM" +} \ No newline at end of file diff --git a/datasets/adcc.json b/datasets/adcc.json new file mode 100644 index 0000000..32334d9 --- /dev/null +++ b/datasets/adcc.json @@ -0,0 +1,36 @@ +{ + "Name": "ADCC", + "Subsets": [], + "HF Link": "nan", + "Link": "https://adccorpus.wixsite.com/site/single-post/2016/11/30/what-is-adcc", + "License": "unknown", + "Year": 2017, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "The Arabic Daily Communication Corpus (ADCC) is a corpus of daily-conversation text written in Modern Standard Arabic, collected from different resources.", + "Volume": "4,000,000", + "Unit": "tokens", + "Ethical Risks": "Medium", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "Towards Intelligent Arabic Text-to-Speech Application for Disabled People", + "Paper Link": "https://ieeexplore.ieee.org/document/7899133", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Upon-Request", + "Cost": "nan", + "Test Split": "No", + "Tasks": "language modeling, topic classification, information retrieval, natural language inference, text-to-speech translation", + "Venue Title": "ICIHT", + "Citations": "1.0", + "Venue Type": "conference", + "Venue Name": "Proceedings of the International Conference on Informatics, Health & Technology", + "Authors": "Amal Alsaif, Njoud Albadrani, Ashwag Alamro, Reham Alsaif", + "Affiliations": "Kingdom of Saudi Arabia Ministry of Education Al-Imam Muhammad Ibn Saud Islamic University", + "Abstract": "Assistive technology customizes speech technology to offer a new 
communication channel for disabled people, such as those who are blind or have speech difficulties. Converting written text into natural speech has been addressed in recent decades for some languages such as English, and is hence used in many applications such as voice answering machines, reading articles, and exploring software for blind people. Other languages such as Arabic are still not fully served by high-quality Text-To-Speech applications. This paper describes our effort in developing an intelligent Text-To-Speech mobile application for Arabic. We use a set of statistical n-gram language models for word prediction and auto-completion for easy typing. A large new Arabic corpus for daily communication in different domains is constructed, which could be used for other purposes. A series of normalization processes, including spelling correction, is applied to the corpus to maintain consistency and unify the occurrence of the same words. We use outsourced Sakhr Arabic Text-To-Speech voices, one of the best speech synthesizers that exist for Arabic. To ensure high usability of the application, we use a simple graphical user interface and easy-access libraries for favorite phrases, with the ability to add pictures with recorded speech. Our experiments show that word prediction using global and local corpora decreases the keystrokes needed to type desired sentences by 50%, with a high prediction rate of 84% for the bigram model.", + "Added By": "Jezia Zakraoui" +} \ No newline at end of file diff --git a/datasets/adi-17.json b/datasets/adi-17.json new file mode 100644 index 0000000..abd4dec --- /dev/null +++ b/datasets/adi-17.json @@ -0,0 +1,139 @@ +{ + "Name": "ADI-17", + "Subsets": [ + { + "Name": "Algeria", + "Dialect": "ar-DZ: (Arabic (Algeria))", + "Volume": "115.7", + "Unit": "hours" + }, + { + "Name": "Egypt", + "Dialect": "ar-EG: (Arabic (Egypt))", + "Volume": "451.1", + "Unit": "hours" + }, + { + "Name": "Iraq", + "Dialect": "ar-IQ: (Arabic (Iraq))", + "Volume": "815.8", + "Unit": "hours" + }, + { + "Name": "Jordan", + "Dialect": "ar-JO: (Arabic (Jordan))", + "Volume": "25.9", + "Unit": "hours" + }, + { + "Name": "Saudi Arabia", + "Dialect": "ar-SA: (Arabic (Saudi Arabia))", + "Volume": "186.1", + "Unit": "hours" + }, + { + "Name": "Kuwait", + "Dialect": "ar-KW: (Arabic (Kuwait))", + "Volume": "108.2", + "Unit": "hours" + }, + { + "Name": "Lebanon", + "Dialect": "ar-LB: (Arabic (Lebanon))", + "Volume": "116.8", + "Unit": "hours" + }, + { + "Name": "Libya", + "Dialect": "ar-LY: (Arabic (Libya))", + "Volume": "127.4", + "Unit": "hours" + }, + { + "Name": "Mauritania", + "Dialect": "ar-MR: (Arabic (Mauritania))", + "Volume": "456.4", + "Unit": "hours" + }, + { + "Name": "Morocco", + "Dialect": "ar-MA: (Arabic (Morocco))", + "Volume": "57.8", + "Unit": "hours" + }, + { + "Name": "Oman", + "Dialect": "ar-OM: (Arabic (Oman))", + "Volume": "58.5", + "Unit": "hours" + }, + { + "Name": "Palestine", + "Dialect": "ar-PS: (Arabic (Palestine))", + "Volume": "121.4", + "Unit": "hours" + }, + { + "Name": "Qatar", + "Dialect": "ar-QA: (Arabic (Qatar))", + "Volume": "62.3", + "Unit": "hours" + }, + { + "Name": "Sudan", + "Dialect": "ar-SD: (Arabic (Sudan))", + "Volume": "47.7", + "Unit": "hours" + }, + { + "Name": "Syria", + "Dialect": "ar-SY: (Arabic (Syria))", + "Volume": "119.5", + "Unit": "hours" + }, + { + "Name": "UAE", + "Dialect": "ar-AE: (Arabic (United Arab Emirates))", + "Volume": "108.4", + "Unit": "hours" + }, + { + "Name": "Yemen", + "Dialect": "ar-YE: (Arabic (Yemen))", + "Volume": "53.4", + "Unit": "hours" + } + ], + 
"HF Link": "nan", + "Link": "https://arabicspeech.org/mgb5/#adi17", + "License": "unknown", + "Year": 2019, + "Language": "ar", + "Dialect": "mixed", + "Domain": "transcribed audio", + "Form": "spoken", + "Collection Style": "crawling and annotation(other)", + "Description": "dialect identification of speech from YouTube to one of the 17 dialects", + "Volume": "3,091", + "Unit": "hours", + "Ethical Risks": "Low", + "Provider": "QCRI", + "Derived From": "nan", + "Paper Title": "The MGB-5 Challenge: Recognition and Dialect Identification of Dialectal Arabic Speech", + "Paper Link": "https://ieeexplore.ieee.org/document/9003960", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Upon-Request", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "dialect identification", + "Venue Title": "ASRU", + "Citations": "18.0", + "Venue Type": "workshop", + "Venue Name": "IEEE Automatic Speech Recognition and Understanding Workshop", + "Authors": "A. Ali,Suwon Shon,Younes Samih,Hamdy Mubarak,Ahmed Abdelali,James R. Glass,S. Renals,K. Choukri", + "Affiliations": ",,University Of D\u00fcsseldorf;Computational Linguistics,,,,,", + "Abstract": "This paper describes the fifth edition of the Multi-Genre Broadcast Challenge (MGB-5), an evaluation focused on Arabic speech recognition and dialect identification. MGB-5 extends the previous MGB-3 challenge in two ways: first it focuses on Moroccan Arabic speech recognition; second the granularity of the Arabic dialect identification task is increased from 5 dialect classes to 17, by collecting data from 17 Arabic speaking countries. Both tasks use YouTube recordings to provide a multi-genre multi-dialectal challenge in the wild. Moroccan speech transcription used about 13 hours of transcribed speech data, split across training, development, and test sets, covering 7-genres: comedy, cooking, family/kids, fashion, drama, sports, and science (TEDx). The fine-grained Arabic dialect identification data was collected from known YouTube channels from 17 Arabic countries. 3,000 hours of this data was released for training, and 57 hours for development and testing. The dialect identification data was divided into three sub-categories based on the segment duration: short (under 5 s), medium (5\u201320 s), and long (>20 s). Overall, 25 teams registered for the challenge, and 9 teams submitted systems for the two tasks. 
We outline the approaches adopted in each system and summarize the evaluation results.", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/adi-5.json b/datasets/adi-5.json new file mode 100644 index 0000000..fa62c0d --- /dev/null +++ b/datasets/adi-5.json @@ -0,0 +1,67 @@ +{ + "Name": "ADI-5", + "Subsets": [ + { + "Name": "Egyptian", + "Dialect": "ar-EG: (Arabic (Egypt))", + "Volume": "14.4", + "Unit": "hours" + }, + { + "Name": "Gulf", + "Dialect": "ar-GLF: (Arabic (Gulf))", + "Volume": "14.1", + "Unit": "hours" + }, + { + "Name": "Levantine", + "Dialect": "ar-LEV: (Arabic (Levant))", + "Volume": "14.3", + "Unit": "hours" + }, + { + "Name": "MSA", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Volume": "14.3", + "Unit": "hours" + }, + { + "Name": "North African", + "Dialect": "ar-NOR: (Arabic (North Africa))", + "Volume": "14.6", + "Unit": "hours" + } + ], + "HF Link": "nan", + "Link": "https://arabicspeech.org/mgb3-adi/", + "License": "unknown", + "Year": 2016, + "Language": "ar", + "Dialect": "mixed", + "Domain": "transcribed audio", + "Form": "spoken", + "Collection Style": "crawling and annotation(other)", + "Description": "The data is divided across the five major Arabic dialects: Egyptian (EGY), Levantine (LAV), Gulf (GLF), North African (NOR), and Modern Standard Arabic (MSA).", + "Volume": "50", + "Unit": "hours", + "Ethical Risks": "Low", + "Provider": "QCRI", + "Derived From": "nan", + "Paper Title": "Automatic Dialect Detection in Arabic Broadcast Speech", + "Paper Link": "https://arxiv.org/pdf/1509.06928.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Upon-Request", + "Cost": "nan", + "Test Split": "No", + "Tasks": "dialect identification", + "Venue Title": "INTERSPEECH", + "Citations": "93.0", + "Venue Type": "conference", + "Venue Name": "Conference of the International Speech Communication Association", + "Authors": "A. Ali,Najim Dehak,P. Cardinal,Sameer Khurana,S. Yella,James R. Glass,P. Bell,S. Renals", + "Affiliations": ",,,,,,,", + "Abstract": "We investigate different approaches for dialect identification in Arabic broadcast speech, using phonetic and lexical features obtained from a speech recognition system, and acoustic features using the i-vector framework. We studied both generative and discriminative classifiers, and we combined these features using a multi-class Support Vector Machine (SVM). We validated our results on an Arabic/English language identification task, with an accuracy of 100%. We used these features in a binary classifier to discriminate between Modern Standard Arabic (MSA) and Dialectal Arabic, with an accuracy of 100%. We further report results using the proposed method to discriminate between the five most widely used dialects of Arabic: namely Egyptian, Gulf, Levantine, North African, and MSA, with an accuracy of 52%. We discuss dialect identification errors in the context of dialect code-switching between Dialectal Arabic and MSA, and compare the error pattern between manually labeled data, and the output from our classifier. 
We also release the train and test data as a standard corpus for dialect identification.", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/adpbc.json b/datasets/adpbc.json new file mode 100644 index 0000000..da70c20 --- /dev/null +++ b/datasets/adpbc.json @@ -0,0 +1,36 @@ +{ + "Name": "ADPBC", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/ADPBC", + "Link": "https://github.com/salsama/Arabic-Information-Extraction-Corpus", + "License": "unknown", + "Year": 2021, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "This corpus contains words and their dependency relations, produced through a series of processing steps.", + "Volume": "16", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "Multiple Institutions", + "Derived From": "nan", + "Paper Title": "ADPBC: Arabic Dependency Parsing Based Corpora for Information Extraction", + "Paper Link": "http://www.mecs-press.org/ijitcs/ijitcs-v13-n1/IJITCS-V13-N1-4.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "part of speech tagging, dependency parsing, topic classification", + "Venue Title": "IJITCS", + "Citations": "0.0", + "Venue Type": "journal", + "Venue Name": "International Journal of Information Technology and Computer Science", + "Authors": "Sally Mohamed,M. Hussien.,Hamdy M. Mousa", + "Affiliations": ",,", + "Abstract": "There is a massive amount of different information and data on the World Wide Web, and the number of Arabic users and contents is widely increasing. Information extraction is an essential issue for accessing and sorting the data on the web. In this regard, information extraction becomes a challenge, especially for languages which have a complex morphology, like Arabic. Consequently, the trend today is to build a new corpus that makes information extraction easier and more precise. This paper presents a linguistically analyzed Arabic corpus, including dependency relations. The collected data includes five fields: sport, religious, weather, news, and biomedical. The output is in the CoNLL universal lattice file format (CoNLL-UL). The corpus contains an index for the sentences and their linguistic meta-data to enable quick mining and search across the corpus. This corpus has seventeen morphological annotations and eight features based on the identification of textual structures, which help to recognize and understand the grammatical characteristics of the text and perform the dependency relation. The parsing and dependency process was conducted using the universal dependency model and corrected manually. The results illustrated the enhancement in the dependency relation corpus. The designed Arabic corpus helps to quickly get linguistic annotations for a text and makes information extraction techniques easy and clear to learn. 
The obtained results illustrated the average enhancement in the dependency relation corpus.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/adult_content_detection_on_arabic_twitter__analysis_and_experiments.json b/datasets/adult_content_detection_on_arabic_twitter__analysis_and_experiments.json new file mode 100644 index 0000000..1b7e8b1 --- /dev/null +++ b/datasets/adult_content_detection_on_arabic_twitter__analysis_and_experiments.json @@ -0,0 +1,36 @@ +{ + "Name": "Adult Content Detection on Arabic Twitter: Analysis and Experiments", + "Subsets": [], + "HF Link": "nan", + "Link": "https://alt.qcri.org/resources/AdultContentDetection.zip", + "License": "unknown", + "Year": 2020, + "Language": "ar", + "Dialect": "mixed", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "Adult Content Detection on Arabic Twitter", + "Volume": "50,000", + "Unit": "sentences", + "Ethical Risks": "High", + "Provider": "QCRI", + "Derived From": "nan", + "Paper Title": "Adult Content Detection on Arabic Twitter: Analysis and Experiments", + "Paper Link": "https://aclanthology.org/2021.wanlp-1.14.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "adult language detection", + "Venue Title": "arXiv", + "Citations": "5.0", + "Venue Type": "preprint", + "Venue Name": "nan", + "Authors": "Hamdy Mubarak, Sabit Hassan and Ahmed Abdelali", + "Affiliations": "Qatar Computing Research Institute", + "Abstract": "With Twitter being one of the most popular social media platforms in the Arab region, it is not surprising to find accounts that post adult content in Arabic tweets, despite the fact that these platforms dissuade users from such content. In this paper, we present a dataset of Twitter accounts that post adult content. We perform an in-depth analysis of the nature of this data and contrast it with normal tweet content. Additionally, we present extensive experiments with traditional machine learning models, deep neural networks and contextual embeddings to identify such accounts. We show that from user information alone, we can identify such accounts with an F1 score of 94.7% (macro average). 
With the addition of only one tweet as input, the F1 score rises to 96.8%.", + "Added By": "Abdelrahman Kaseb" +} \ No newline at end of file diff --git a/datasets/afewc.json b/datasets/afewc.json new file mode 100644 index 0000000..26c5b77 --- /dev/null +++ b/datasets/afewc.json @@ -0,0 +1,36 @@ +{ + "Name": "AFEWC", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/AFEWC", + "Link": "https://sourceforge.net/projects/crlcl/", + "License": "CC BY-NC 2.0", + "Year": 2013, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "wikipedia", + "Form": "text", + "Collection Style": "crawling", + "Description": "A multilingual collection of comparable text articles in Arabic, French, and English.", + "Volume": "40,290", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "INRIA", + "Derived From": "nan", + "Paper Title": "Extracting Comparable Articles from Wikipedia and Measuring their Comparabilities", + "Paper Link": "https://www.sciencedirect.com/science/article/pii/S1877042813041402", + "Script": "Arab", + "Tokenized": "No", + "Host": "sourceforge", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "machine translation", + "Venue Title": "PSBS", + "Citations": "21.0", + "Venue Type": "journal", + "Venue Name": "Procedia Social and Behavioral Sciences", + "Authors": "Motaz K. Saad,David Langlois,Kamel Smaili", + "Affiliations": ",,", + "Abstract": "Parallel corpora are not available for all domains and languages, but statistical methods in multilingual research domains require huge parallel/comparable corpora. Comparable corpora can be used when the parallel is not sufficient or not available for specific domains and languages. In this paper, we propose a method to extract all comparable articles from Wikipedia for multiple languages based on interlanguage links. We also extract comparable articles from the Euro News website. We also present two comparability measures (CM) to compute the degree of comparability of multilingual articles. We extracted about 40K and 34K comparable articles from Wikipedia and Euro News respectively in three languages including Arabic, French, and English. 
Experimental results of the comparability measures show that our measure can capture the comparability of multilingual corpora and allows retrieving articles in different languages concerning the same topic.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/afnd.json b/datasets/afnd.json new file mode 100644 index 0000000..aed50f9 --- /dev/null +++ b/datasets/afnd.json @@ -0,0 +1,36 @@ +{ + "Name": "AFND", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/AFND", + "Link": "https://data.mendeley.com/datasets/67mhx6hhzd/1", + "License": "CC BY 4.0", + "Year": 2022, + "Language": "ar", + "Dialect": "mixed", + "Domain": "news articles", + "Form": "text", + "Collection Style": "crawling", + "Description": "AFND consists of 606912 public news articles that were scraped from 134 public news websites of 19 different Arab countries over a 6-month period using Python scripts.", + "Volume": "606,912", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "Multiple Institutions", + "Derived From": "nan", + "Paper Title": "AFND: Arabic fake news dataset for the detection and classification of articles credibility", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "Mendeley Data", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "news credibility detection", + "Venue Title": "Data in Brief", + "Citations": "nan", + "Venue Type": "journal", + "Venue Name": "Data in Brief", + "Authors": "Ashwaq Khalila, Moath Jarrah, Monther Aldwairi, Manar Jaradat", + "Affiliations": "Department of Computer Engineering, Jordan University of Science and Technology, Department of Computer Engineering, Jordan University of Science and Technology, College of Technological Innovation, Zayed University, Department of Computer Engineering, The Hashemite University", + "Abstract": "The news credibility detection task has started to gain more attention recently due to the rapid increase of news on different social media platforms. This article provides a large, labeled, and diverse Arabic Fake News Dataset (AFND) that is collected from public Arabic news websites. This dataset enables the research community to use supervised and unsupervised machine learning algorithms to classify the credibility of Arabic news articles. AFND consists of 606912 public news articles that were scraped from 134 public news websites of 19 different Arab countries over a 6-month period using Python scripts. The Arabic fact-check platform, Misbar, is used manually to classify each public news source into credible, not credible, or undecided. Weak supervision is applied to label news articles with the same label as the public source. AFND is imbalanced in the number of articles in each class. Hence, it is useful for researchers who focus on finding solutions for imbalanced datasets. 
The dataset is available in JSON format and can be accessed from the Mendeley Data repository.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/afrd__arabic_fake_reviews_detection.json b/datasets/afrd__arabic_fake_reviews_detection.json new file mode 100644 index 0000000..d320839 --- /dev/null +++ b/datasets/afrd__arabic_fake_reviews_detection.json @@ -0,0 +1,36 @@ +{ + "Name": "AFRD: Arabic Fake Reviews Detection", + "Subsets": [], + "HF Link": "https://hf.co/datasets/Noor0/AFRD_Arabic-Fake-Reviews-Detection", + "Link": "https://github.com/NoorAmer0/AFRD-arabic-fake-reviews-dataset", + "License": "CC BY 4.0", + "Year": 2024, + "Language": "ar", + "Dialect": "ar-SA: (Arabic (Saudi Arabia))", + "Domain": "reviews", + "Form": "text", + "Collection Style": "other", + "Description": "Arabic Fake Reviews Detection (AFRD) is the first gold-standard dataset comprising three domains, namely the hotel, restaurant, and product domains. Each domain has a set of attributes: the reviewer\u2019s age, the reviewer\u2019s gender, the service name, the review\u2019s text, the rating, the text\u2019s polarity, and the review\u2019s class. The overall balanced dataset consists of 1,728 reviews: 310 for the hotel domain, 714 for the restaurant domain, and 704 for the product domain, with the two classes in each domain balanced. An unbalanced version with 1,958 reviews is also available.", + "Volume": "1,958", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "Qassim University", + "Derived From": "nan", + "Paper Title": "Multiscale cascaded domain-based approach for Arabic fake reviews detection in e-commerce platforms", + "Paper Link": "https://www.sciencedirect.com/science/article/pii/S1319157824000156", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "sentiment analysis, dialect identification, review classification, gender identification", + "Venue Title": "JKSU", + "Citations": "nan", + "Venue Type": "journal", + "Venue Name": "Journal of King Saud University - Computer and Information Sciences", + "Authors": "Nour Qandos, Ghadir Hamad, Maitha Alharbi, Shatha Alturki, Waad Alharbi, Arwa A. Albelaihi", + "Affiliations": "nan", + "Abstract": "Fake reviews in e-commerce can lead to customer deception and financial losses. Despite the importance of fake reviews detection, studies for Arabic language are scarce due to the lack of comprehensive datasets. This study addresses this gap by introducing a full-gold standard dataset, the Arabic Fake Reviews Detection (AFRD), across hotels, restaurants, and product domains. To identify the most effective model for each domain in the context of fake review detection, this research employed Bi-LSTM, Bi-GRU, CNN+Bi-LSTM, and CNN+Bi-GRU models. These models were then used in a cascading approach called Multiscale Cascaded domain-based (MCDB), which transfers knowledge from one domain to enhance results in other domains. Experimental results demonstrated that the MCDB approach improved the results of the models by 2.09% to 7.8% in terms of accuracy. The introduced dataset can be used to build effective models for Arabic e-commerce platforms, in addition to further Natural Language Processing applications. 
This study demonstrates that leveraging domain-specific datasets in a cascading manner can significantly improve performance, holding substantial implications for future research in problems with limited-size datasets.", + "Added By": "Nour Qandos" +} \ No newline at end of file diff --git a/datasets/aghlat.json b/datasets/aghlat.json new file mode 100644 index 0000000..e6f0513 --- /dev/null +++ b/datasets/aghlat.json @@ -0,0 +1,36 @@ +{ + "Name": "Aghlat", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/aghlat", + "Link": "https://github.com/linuxscout/aghlat", + "License": "unknown", + "Year": 2019, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "manual curation", + "Description": "Arabic misspelling corpus", + "Volume": "331", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "-", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "misspelling detection, misspelling correction", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/ags.json b/datasets/ags.json new file mode 100644 index 0000000..2628aa1 --- /dev/null +++ b/datasets/ags.json @@ -0,0 +1,36 @@ +{ + "Name": "AGS", + "Subsets": [], + "HF Link": "https://hf.co/datasets/FahdSeddik/AGS-Corpus", + "Link": "https://hf.co/datasets/FahdSeddik/AGS-Corpus", + "License": "CC BY-NC 4.0", + "Year": 2023, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "AGS is the first publicly accessible abstractive summarization dataset for Arabic. It consists of 142,000 pairs of articles and summaries, all written in Modern Standard Arabic (MSA). The summaries are generated using GPT-3.5 Turbo, a large language model, through meticulous prompt engineering. The dataset covers a wide range of topics, such as politics, sports, culture, science, and technology.", + "Volume": "142,000", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "AGS: Arabic GPT Summarization Corpus", + "Paper Link": "https://ieeexplore.ieee.org/abstract/document/10441794", + "Script": "Arab", + "Tokenized": "No", + "Host": "HuggingFace", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "summarization", + "Venue Title": "ICECCE", + "Citations": "nan", + "Venue Type": "conference", + "Venue Name": "International Conference on Electrical, Communication and Computer Engineering", + "Authors": "Abdelrahman Atef, Fahd Seddik, Abdulrahman Elbedewy", + "Affiliations": "nan", + "Abstract": "This paper presents a novel method for abstractive summarization of Arabic text using a large-language model (LLM), GPT-3.5 Turbo. We introduce AGS, the first publicly available dataset consisting of 142,000 pairs of articles and summaries with an average compression ratio of 70%, written in Modern Standard Arabic (MSA) and generated by an LLM. We proposed a baseline model that achieves high scores on ROUGE-L, similarity score, and compression ratio, while capturing the main points and details of the original articles. 
We fine-tuned mT5 on our dataset, reaching a compression ratio of 62% and a similarity score of 82.65%. We release the AGS dataset to the research community, hoping to advance the field of Arabic natural language processing (NLP) and facilitate the development of effective abstractive summarization systems for Arabic text.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/ai_society_translated.json b/datasets/ai_society_translated.json new file mode 100644 index 0000000..36f9822 --- /dev/null +++ b/datasets/ai_society_translated.json @@ -0,0 +1,36 @@ +{ + "Name": "AI Society Translated", + "Subsets": [], + "HF Link": "https://hf.co/datasets/camel-ai/ai_society_translated", + "Link": "https://hf.co/datasets/camel-ai/ai_society_translated", + "License": "CC BY-NC 4.0", + "Year": 2024, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "machine translation", + "Description": "The original AI Society dataset is in English and is composed of 25K conversations between two gpt-3.5-turbo agents. The dataset is obtained by running role-playing for a combination of 50 user roles and 50 assistant roles with each combination running over 10 tasks.", + "Volume": "25,000", + "Unit": "documents", + "Ethical Risks": "High", + "Provider": "Camel AI", + "Derived From": "AI Society", + "Paper Title": "CAMEL: Communicative Agents for \u201cMind\u201d Exploration of Large Language Model Society", + "Paper Link": "https://arxiv.org/pdf/2303.17760", + "Script": "Arab", + "Tokenized": "No", + "Host": "HuggingFace", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "instruction tuning", + "Venue Title": "arXiv", + "Citations": "nan", + "Venue Type": "preprint", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "The rapid advancement of chat-based language models has led to remarkable progress in complex task-solving. However, their success heavily relies on human input to guide the conversation, which can be challenging and time-consuming. This paper explores the potential of building scalable techniques to facilitate autonomous cooperation among communicative agents, and provides insight into their \u201ccognitive\u201d processes. To address the challenges of achieving autonomous cooperation, we propose a novel communicative agent framework named role-playing. Our approach involves using inception prompting to guide chat agents toward task completion while maintaining consistency with human intentions. We showcase how role-playing can be used to generate conversational data for studying the behaviors and capabilities of a society of agents, providing a valuable resource for investigating conversational language models. In particular, we conduct comprehensive studies on instruction-following cooperation in multi-agent settings. 
Our contributions include introducing a novel communicative agent framework, offering a scalable approach for studying the cooperative behaviors and capabilities of multi-agent systems, and open-sourcing our library to support research on communicative agents and beyond: https://github.com/camel-ai/camel.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/ajdir_corpora.json b/datasets/ajdir_corpora.json new file mode 100644 index 0000000..e47bf2b --- /dev/null +++ b/datasets/ajdir_corpora.json @@ -0,0 +1,36 @@ +{ + "Name": "Ajdir Corpora", + "Subsets": [], + "HF Link": "nan", + "Link": "http://aracorpus.e3rab.com/argistestsrv.nmsu.edu/AraCorpus/", + "License": "unknown", + "Year": 2010, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "crawling", + "Description": "This is raw text from Arabic daily newspapers collected over a year between 2004 and 2005. Each file is compiled as cleaned raw text from documents that are separated by two blank lines.", + "Volume": "28", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "text generation, language modeling", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/ajgt.json b/datasets/ajgt.json new file mode 100644 index 0000000..edbd24f --- /dev/null +++ b/datasets/ajgt.json @@ -0,0 +1,36 @@ +{ + "Name": "AJGT", + "Subsets": [], + "HF Link": "https://hf.co/datasets/komari6/ajgt_twitter_ar", + "Link": "https://github.com/komari6/Arabic-twitter-corpus-AJGT", + "License": "unknown", + "Year": 2017, + "Language": "ar", + "Dialect": "ar-JO: (Arabic (Jordan))", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "The corpus consists of 1,800 tweets annotated as positive or negative, written in either Modern Standard Arabic (MSA) or Jordanian dialect.", + "Volume": "1,800", + "Unit": "sentences", + "Ethical Risks": "Medium", + "Provider": "Multiple Institutions", + "Derived From": "nan", + "Paper Title": "Arabic Tweets Sentimental Analysis Using Machine Learning", + "Paper Link": "https://link.springer.com/chapter/10.1007/978-3-319-60042-0_66", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "sentiment analysis", + "Venue Title": "IEA/AIE", + "Citations": "53.0", + "Venue Type": "conference", + "Venue Name": "International Conference on Industrial, Engineering and Other Applications of Applied Intelligent Systems", + "Authors": "K. Alomari,H. M. Elsherif,K. Shaalan", + "Affiliations": ",,", + "Abstract": "The continuous rapid growth of electronic Arabic content in social media channels, and on Twitter in particular, poses an opportunity for opinion mining research. Nevertheless, it is hindered by either the lack of sentiment analysis resources or Arabic language text analysis challenges. This study introduces an Arabic Jordanian twitter corpus where tweets are annotated as either positive or negative. 
It investigates different supervised machine learning sentiment analysis approaches when applied to Arabic users\u2019 social media on general subjects, written in either Modern Standard Arabic (MSA) or Jordanian dialect. Experiments are conducted to evaluate the use of different weighting schemes, stemming, and N-gram term techniques and scenarios. The experimental results provide the best scenario for each classifier and indicate that the SVM classifier using the term frequency\u2013inverse document frequency (TF-IDF) weighting scheme with stemming and bigram features outperforms the Naive Bayesian classifier\u2019s best scenario. Furthermore, this study\u2019s results outperformed other results from comparable related work.", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/akec.json b/datasets/akec.json new file mode 100644 index 0000000..d01c641 --- /dev/null +++ b/datasets/akec.json @@ -0,0 +1,36 @@ +{ + "Name": "AKEC", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/AKEC", + "Link": "https://github.com/ailab-uniud/akec", + "License": "unknown", + "Year": 2016, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "The corpus consists of 160 Arabic documents and their keyphrases.", + "Volume": "160", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "University of Udine,University of Sheffield", + "Derived From": "Arabic Newspapers Corpus, Corpus of Contemporary Arabic, Essex Arabic Summaries Corpus, Open Source Arabic Corpora", + "Paper Title": "Towards building a standard dataset for Arabic keyphrase extraction evaluation", + "Paper Link": "https://ieeexplore.ieee.org/document/7875927", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "keyphrase extraction", + "Venue Title": "IALP", + "Citations": "2.0", + "Venue Type": "conference", + "Venue Name": "International Conference on Asian Language Processing", + "Authors": "Muhammad Helmy,Marco Basaldella,Eddy Maddalena,S. Mizzaro,Gianluca Demartini", + "Affiliations": ",,,,", + "Abstract": "Keyphrases are short phrases that best represent a document content. They can be useful in a variety of applications, including document summarization and retrieval models. In this paper, we introduce the first dataset of keyphrases for an Arabic document collection, obtained by means of crowdsourcing. We experimentally evaluate different crowdsourced answer aggregation strategies and validate their performances against expert annotations to evaluate the quality of our dataset. 
We report on our experimental results, the dataset features, some lessons learned, and ideas for future work.", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/al-hayat_arabic_corpus.json b/datasets/al-hayat_arabic_corpus.json new file mode 100644 index 0000000..05d33e2 --- /dev/null +++ b/datasets/al-hayat_arabic_corpus.json @@ -0,0 +1,36 @@ +{ + "Name": "Al-Hayat Arabic Corpus", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalogue.elra.info/en-us/repository/browse/ELRA-W0030/", + "License": "Non Commercial Use - ELRA END USER", + "Year": 2002, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "crawling", + "Description": "The corpus was developed in the course of a research project at the University of Essex, in collaboration with the Open University. The corpus contains Al-Hayat newspaper articles with added value for Language Engineering and Information Retrieval application development. The data have been distributed into 7 subject-specific databases, thus following the Al-Hayat subject tags: General, Car, Computer, News, Economics, Science, and Sport. Mark-up, numbers, special characters and punctuation have been removed. The size of the total file is 268 MB. The dataset contains 18,639,264 distinct tokens in 42,591 articles, organised in 7 domains.", + "Volume": "42,591", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "ELRA", + "Access": "With-Fee", + "Cost": "720.00\u20ac", + "Test Split": "No", + "Tasks": "topic classification", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/alc__arabic_learner_corpus.json b/datasets/alc__arabic_learner_corpus.json new file mode 100644 index 0000000..d1e8b50 --- /dev/null +++ b/datasets/alc__arabic_learner_corpus.json @@ -0,0 +1,36 @@ +{ + "Name": "ALC: Arabic Learner Corpus", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2015S10", + "License": "custom", + "Year": 2013, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "transcribed audio", + "Form": "spoken", + "Collection Style": "manual curation", + "Description": "Comprises a collection of texts written by learners of Arabic in Saudi Arabia.", + "Volume": "1", + "Unit": "hours", + "Ethical Risks": "Low", + "Provider": "Leeds University", + "Derived From": "nan", + "Paper Title": "Arabic Learner Corpus v1: A New Resource for Arabic Language Research", + "Paper Link": "https://eprints.whiterose.ac.uk/75470/22/AtwellVer2.13.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "25 $", + "Test Split": "No", + "Tasks": "speech recognition", + "Venue Title": "other", + "Citations": "12.0", + "Venue Type": "preprint", + "Venue Name": "nan", + "Authors": "Ayg Alfaifi,E. 
Atwell", + "Affiliations": ",", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/aljazeera-dialectal_speech.json b/datasets/aljazeera-dialectal_speech.json new file mode 100644 index 0000000..9ffdba9 --- /dev/null +++ b/datasets/aljazeera-dialectal_speech.json @@ -0,0 +1,36 @@ +{ + "Name": "Aljazeera-dialectal speech", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/aljazeera_dialectal_speech", + "Link": "https://alt.qcri.org/resources/aljazeeraspeechcorpus/", + "License": "unknown", + "Year": 2015, + "Language": "ar", + "Dialect": "mixed", + "Domain": "transcribed audio", + "Form": "spoken", + "Collection Style": "other", + "Description": "utterance-level dialect labels for 57 hours of high-quality audio from Al Jazeera consisting of four major varieties of DA: Egyptian, Levantine, Gulf, and North African.", + "Volume": "57", + "Unit": "hours", + "Ethical Risks": "Low", + "Provider": "QCRI", + "Derived From": "nan", + "Paper Title": "Crowdsource a little to label a lot:\r\nLabeling a Speech Corpus of Dialectal Arabic", + "Paper Link": "https://www.isca-archive.org/interspeech_2015/wray15_interspeech.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "QCRI Resources", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "speech recognition", + "Venue Title": "INTERSPEECH", + "Citations": "23.0", + "Venue Type": "conference", + "Venue Name": "Conference of the International Speech Communication Association", + "Authors": "Samantha Wray,Ahmed Ali", + "Affiliations": ",", + "Abstract": "Arabic is a language with great dialectal variety, with Modern Standard Arabic (MSA) being the only standardized dialect. Spoken Arabic is characterized by frequent code-switching between MSA and Dialectal Arabic (DA). DA varieties are typically differentiated by region, but despite their wide-spread usage, they are under-resourced and lack viable corpora and tools necessary for speech recognition and natural language processing. Existing DA speech corpora are limited in scope, consisting of mainly telephone conversations and scripted speech. In this paper we describe our efforts for using crowdsourcing to create a labeled multi-dialectal speech corpus. We obtained utterance-level dialect labels for 57 hours of high-quality audio from Al Jazeera consisting of four major varieties of DA: Egyptian, Levantine, Gulf, and North African. Using speaker linking to identify utterances spoken by the same speaker, and measures of label accuracy likelihood based on annotator behavior, we automatically labeled an additional 94 hours. 
The complete corpus contains 850 hours with approximately 18% DA speech.", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/aljazeera_deleted_comments.json b/datasets/aljazeera_deleted_comments.json new file mode 100644 index 0000000..2fb7693 --- /dev/null +++ b/datasets/aljazeera_deleted_comments.json @@ -0,0 +1,36 @@ +{ + "Name": "Aljazeera Deleted Comments", + "Subsets": [], + "HF Link": "nan", + "Link": "https://alt.qcri.org/people/hmubarak/public_html/offensive/", + "License": "unknown", + "Year": 2017, + "Language": "ar", + "Dialect": "mixed", + "Domain": "commentary", + "Form": "text", + "Collection Style": "other", + "Description": "offensive and obscene language dataset of user comments deleted from the Aljazeera news site", + "Volume": "33,100", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "QCRI", + "Derived From": "nan", + "Paper Title": "Abusive Language Detection on Arabic Social Media", + "Paper Link": "https://aclanthology.org/W17-3008.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "QCRI Resources", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "hate speech detection, abusive language detection", + "Venue Title": "ALW", + "Citations": "148.0", + "Venue Type": "workshop", + "Venue Name": "Abusive Language Online", + "Authors": "Hamdy Mubarak,Kareem Darwish,Walid Magdy", + "Affiliations": ",,The University of Edinburgh", + "Abstract": "In this paper, we present our work on detecting abusive language on Arabic social media. We extract a list of obscene words and hashtags using common patterns used in offensive and rude communications. We also classify Twitter users according to whether they use any of these words or not in their tweets. We expand the list of obscene words using this classification, and we report results on a newly created dataset of classified Arabic tweets (obscene, offensive, and clean). We make this dataset freely available for research, in addition to the list of obscene words and hashtags.
We are also publicly releasing a large corpus of classified user comments that were deleted from a popular Arabic news site due to violations of the site\u2019s rules and guidelines.", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/alr__arabic_laptop_reviews_dataset.json b/datasets/alr__arabic_laptop_reviews_dataset.json new file mode 100644 index 0000000..21a8389 --- /dev/null +++ b/datasets/alr__arabic_laptop_reviews_dataset.json @@ -0,0 +1,36 @@ +{ + "Name": "ALR: Arabic Laptop Reviews dataset", + "Subsets": [], + "HF Link": "nan", + "Link": "https://github.com/bashartalafha/Arabic-Laptop-Reviews-ALR-Dataset", + "License": "unknown", + "Year": 2017, + "Language": "ar", + "Dialect": "mixed", + "Domain": "reviews", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "Arabic Laptops Reviews (ALR) dataset focuses on laptops reviews written in Arabic", + "Volume": "1,753", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "Jordan University of Science and Technology, Al-Balqa' Applied University, Al-Buraimi University College, Texas A&M University-San Antonio", + "Derived From": "nan", + "Paper Title": "Aspect-Based Sentiment Analysis of Arabic Laptop Reviews", + "Paper Link": "https://www.researchgate.net/publication/329557366_Aspect-Based_Sentiment_Analysis_of_Arabic_Laptop_Reviews", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "sentiment analysis", + "Venue Title": "ACIT", + "Citations": "1.0", + "Venue Type": "conference", + "Venue Name": "International Arab Conference on Information Technology", + "Authors": "Mahmoud Al-Ayyoub, Amal Gigieh, Areej Al-Qwaqenah, Mohammed N. Al-Kabi, Bashar Talafha, Izzat Alsmadi", + "Affiliations": "Jordan University of Science and Technology, Al-Balqa' Applied University, Al-Buraimi University College, Texas A&M University-San Antonio", + "Abstract": "Sentiment Analysis (SA) is one of the hottest research areas in Natural Language Processing (NLP) with vast commercial as well as academic applications. One of the most interesting versions of SA is called Aspect-Based SA (ABSA). Currently, most of the researchers focus on English text. Other languages such as Arabic have received less attention. To the best of our knowledge, only few papers have addressed ABSA of Arabic reviews and they have all been applied on only three datasets. In this work, we demonstrate our efforts to build the Arabic Laptops Reviews (ALR) dataset, which focuses on laptops reviews written in Arabic. To make it easy to use, the ALR dataset is prepared according to the annotation scheme of SemEval16-Task5. The annotation scheme considers two problems: aspect category prediction and sentiment polarity label prediction. It also comes with an evaluation procedure that extracts n-grams\u2019 features and employs a Support Vector Machine (SVM) classifier in order to allow researchers to gauge and compare the performance of their systems. The evaluation results show that there is a lot of room for improvements in the performance of the SVM classifier for the aspect category prediction problem.
As for the sentiment polarity label prediction, SVM\u2019s accuracy is actually high.", + "Added By": "Wafaa Mohammed" +} \ No newline at end of file diff --git a/datasets/alriyadh-newspaper-covid-dataset.json b/datasets/alriyadh-newspaper-covid-dataset.json new file mode 100644 index 0000000..d4fe7d4 --- /dev/null +++ b/datasets/alriyadh-newspaper-covid-dataset.json @@ -0,0 +1,36 @@ +{ + "Name": "AlRiyadh-Newspaper-Covid-Dataset", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/AlRiyadh_Newspaper_Covid", + "Link": "https://github.com/alioh/AlRiyadh-Newspaper-Covid-Dataset", + "License": "CC BY 3.0", + "Year": 2021, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "crawling", + "Description": "It is a dataset of Arabic newspaper articles addressing COVID-19 related events. The data origin is Alriyadh newspaper. It contains all news articles until 1 February 2021", + "Volume": "24,084", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "topic analysis (covid19)", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Emad A Alghamdi" +} \ No newline at end of file diff --git a/datasets/alue.json b/datasets/alue.json new file mode 100644 index 0000000..c2a25cb --- /dev/null +++ b/datasets/alue.json @@ -0,0 +1,36 @@ +{ + "Name": "ALUE", + "Subsets": [], + "HF Link": "nan", + "Link": "https://www.alue.org/tasks", + "License": "unknown", + "Year": 2021, + "Language": "ar", + "Dialect": "mixed", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "8 carefully selected and previously published tasks", + "Volume": "nan", + "Unit": "nan", + "Ethical Risks": "Medium", + "Provider": "Mawdoo3", + "Derived From": "OSACT4, SemEval-2018, IDAT, XNLI, MADAR, NSURL-2019", + "Paper Title": "ALUE: Arabic Language Understanding Evaluation", + "Paper Link": "https://aclanthology.org/2021.wanlp-1.18.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "irony detection, dialect identification, semantic question similarity, offensive language detection, emotion detection, sentiment intensity regression, entailment", + "Venue Title": "WANLP", + "Citations": "nan", + "Venue Type": "workshop", + "Venue Name": "Arabic Natural Language Processing Workshop", + "Authors": "Haitham Seelawi,Ibraheem Tuffaha,Mahmoud Gzawi,Wael Farhan,Bashar Talafha,Riham Badawi,Zyad Sober,Oday Al-Dweik,Abed Alhakim Freihat,Hussein T. Al-Natsheh", + "Affiliations": ",,,,,,,,,", + "Abstract": "The emergence of Multi-task learning (MTL) models in recent years has helped push the state of the art in Natural Language Understanding (NLU). We strongly believe that many NLU problems in Arabic are especially poised to reap the benefits of such models. To this end we propose the Arabic Language Understanding Evaluation Benchmark (ALUE), based on 8 carefully selected and previously published tasks.
For five of these, we provide new privately held evaluation datasets to ensure the fairness and validity of our benchmark. We also provide a diagnostic dataset to help researchers probe the inner workings of their models. Our initial experiments show that MTL models outperform their singly trained counterparts on most tasks. But in order to entice participation from the wider community, we stick to publishing singly trained baselines only. Nonetheless, our analysis reveals that there is plenty of room for improvement in Arabic NLU. We hope that ALUE will play a part in helping our community realize some of these improvements. Interested researchers are invited to submit their results to our online, and publicly accessible leaderboard.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/amara.json b/datasets/amara.json new file mode 100644 index 0000000..d811e83 --- /dev/null +++ b/datasets/amara.json @@ -0,0 +1,36 @@ +{ + "Name": "AMARA", + "Subsets": [], + "HF Link": "https://hf.co/datasets/Helsinki-NLP/qed_amara", + "Link": "https://alt.qcri.org/resources/qedcorpus/", + "License": "custom", + "Year": 2014, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "crawling and annotation(translation)", + "Description": "multilingually aligned for 20 languages, i.e. 20 monolingual corpora and 190 parallel corpora", + "Volume": "154,301", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "QCRI", + "Derived From": "nan", + "Paper Title": "The AMARA Corpus: Building Parallel Language Resources for the Educational Domain", + "Paper Link": "http://www.lrec-conf.org/proceedings/lrec2014/pdf/877_Paper.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "QCRI Resources", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "machine translation", + "Venue Title": "LREC", + "Citations": "59.0", + "Venue Type": "conference", + "Venue Name": "International Conference on Language Resources and Evaluation", + "Authors": "Ahmed Abdelali,Francisco Guzm\u00e1n,Hassan Sajjad,S. Vogel", + "Affiliations": ",,,", + "Abstract": "This paper presents the AMARA corpus of on-line educational content: a new parallel corpus of educational video subtitles, multilingually aligned for 20 languages, i.e. 20 monolingual corpora and 190 parallel corpora. This corpus includes both resource-rich languages such as English and Arabic, and resource-poor languages such as Hindi and Thai. In this paper, we describe the gathering, validation, and preprocessing of a large collection of parallel, community-generated subtitles. Furthermore, we describe the methodology used to prepare the data for Machine Translation tasks. Additionally, we provide a document-level, jointly aligned development and test sets for 14 language pairs, designed for tuning and testing Machine Translation systems.
We provide baseline results for these tasks, and highlight some of the challenges we face when building machine translation systems for educational content.", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/an-nahar_newspaper_text_corpus.json b/datasets/an-nahar_newspaper_text_corpus.json new file mode 100644 index 0000000..1557575 --- /dev/null +++ b/datasets/an-nahar_newspaper_text_corpus.json @@ -0,0 +1,36 @@ +{ + "Name": "An-Nahar Newspaper Text Corpus", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalogue.elra.info/en-us/repository/browse/ELRA-W0027/", + "License": "Non Commercial Use - ELRA END USER", + "Year": 2001, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "crawling", + "Description": "The An-Nahar Lebanon Newspaper Text Corpus comprises articles in standard Arabic from 1995 to 2000 (6 years) stored as HTML files on CD-ROM media. Each year contains 45,000 articles and 24 million words.", + "Volume": "45,000", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "ELRA", + "Access": "With-Fee", + "Cost": "3,024.00\u20ac", + "Test Split": "No", + "Tasks": "language modeling, text generation", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/anad__arabic_natural_audio_dataset.json b/datasets/anad__arabic_natural_audio_dataset.json new file mode 100644 index 0000000..a179a6b --- /dev/null +++ b/datasets/anad__arabic_natural_audio_dataset.json @@ -0,0 +1,36 @@ +{ + "Name": "ANAD: Arabic Natural Audio Dataset", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/ANAD", + "Link": "https://data.mendeley.com/datasets/xm232yxf7t/1", + "License": "CC BY 4.0", + "Year": 2018, + "Language": "ar", + "Dialect": "mixed", + "Domain": "transcribed audio", + "Form": "spoken", + "Collection Style": "manual curation", + "Description": "Eight videos of live calls between an anchor and a human outside the studio were downloaded from online Arabic talk shows. Each video was then divided into turns: callers and receivers. To label each video, 18 listeners were asked to listen to each video and select whether they perceive a happy, angry or surprised emotion. Silence, laughs and noisy chunks were removed.
Every chunk was then automatically divided into 1 sec speech units forming our final corpus composed of 1384 records.", + "Volume": "1,384", + "Unit": "sentences", + "Ethical Risks": "High", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "Mendeley Data", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "emotion recognition", + "Venue Title": "nan", + "Citations": "14.0", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "Samira Klaylat, Ziad Osman, Rached Zantout, Lama Hamandi", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Mustafa Ghaleb" +} \ No newline at end of file diff --git a/datasets/anercorp.json b/datasets/anercorp.json new file mode 100644 index 0000000..0c623f4 --- /dev/null +++ b/datasets/anercorp.json @@ -0,0 +1,36 @@ +{ + "Name": "ANERcorp", + "Subsets": [], + "HF Link": "nan", + "Link": "https://camel.abudhabi.nyu.edu/anercorp/", + "License": "CC BY-SA 4.0", + "Year": 2020, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "collected from different resources", + "Volume": "316", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "New York University Abu Dhabi", + "Derived From": "nan", + "Paper Title": "CAMeL Tools: An Open Source Python Toolkit for Arabic Natural Language Processing", + "Paper Link": "https://aclanthology.org/2020.lrec-1.868.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "CAMeL Resources", + "Access": "Upon-Request", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "named entity recognition", + "Venue Title": "LREC", + "Citations": "22.0", + "Venue Type": "conference", + "Venue Name": "International Conference on Language Resources and Evaluation", + "Authors": "Ossama Obeid,Nasser Zalmout,Salam Khalifa,Dima Taji,M. Oudah,Bashar Alhafni,Go Inoue,Fadhl Eryani,Alexander Erdmann,Nizar Habash", + "Affiliations": ",,New York University Abu Dhabi,,,,New York University;New York University Abu Dhabi,,,", + "Abstract": "We present CAMeL Tools, a collection of open-source tools for Arabic natural language processing in Python. CAMeL Tools currently provides utilities for pre-processing, morphological modeling, Dialect Identification, Named Entity Recognition and Sentiment Analysis.
In this paper, we describe the design of CAMeL Tools and the functionalities it provides.", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/anetac.json b/datasets/anetac.json new file mode 100644 index 0000000..6c83faf --- /dev/null +++ b/datasets/anetac.json @@ -0,0 +1,36 @@ +{ + "Name": "ANETAC", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/ANETAC", + "Link": "https://github.com/MohamedHadjAmeur/ANETAC", + "License": "unknown", + "Year": 2020, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "English-Arabic named entity transliteration and classification dataset", + "Volume": "79,924", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "USTHB University, University of Salford", + "Derived From": "nan", + "Paper Title": "ANETAC: Arabic Named Entity Transliteration and Classification Dataset", + "Paper Link": "https://arxiv.org/pdf/1907.03110.pdf", + "Script": "Arab-Latn", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "named entity recognition,transliteration", + "Venue Title": "arXiv", + "Citations": "nan", + "Venue Type": "preprint", + "Venue Name": "nan", + "Authors": "Mohamed Seghir Hadj Ameur,Farid Meziane,Ahmed Guessoum", + "Affiliations": "USTHB University, University of Salford, USTHB University", + "Abstract": "nan", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/annotated_shami_corpus.json b/datasets/annotated_shami_corpus.json new file mode 100644 index 0000000..f9448a0 --- /dev/null +++ b/datasets/annotated_shami_corpus.json @@ -0,0 +1,36 @@ +{ + "Name": "Annotated Shami Corpus", + "Subsets": [], + "HF Link": "nan", + "Link": "https://github.com/christios/annotated-shami-corpus", + "License": "CC BY 4.0", + "Year": 2021, + "Language": "ar", + "Dialect": "ar-LB: (Arabic (Lebanon))", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "Subsection of the Lebanese portion of the Shami Corpus annotated for spelling standardization (CODA), morphological segmentation and tagging, and spontaneous orthography taxonomy tagging.", + "Volume": "10,000", + "Unit": "tokens", + "Ethical Risks": "Medium", + "Provider": "nan", + "Derived From": "Shami Corpus", + "Paper Title": "Orthography Standardization in Arabic Dialects", + "Paper Link": "https://dspace.cuni.cz/handle/20.500.11956/147949", + "Script": "Arab", + "Tokenized": "Yes", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "part of speech tagging, morphological analysis, error class taxonomy tagging, CODA", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "Christian Khairallah", + "Affiliations": "Charles University in Prague, Saarland University", + "Abstract": "Spontaneous orthography in Arabic dialects poses one of the biggest obstacles in the way of Dialectal Arabic NLP applications. As the Arab world enjoys a wide array of these widely spoken and recently written, non-standard, low-resource varieties, this thesis presents a detailed account of this relatively overlooked phenomenon. It sets out to show that continuously creating additional noise-free, manually standardized corpora of Dialectal Arabic does not free us from the shackles of non-standard (spontaneous) orthography. Because real-world data will most often come in a noisy format, it also investigates ways to ease the amount of noise in textual data. As a proof of concept, we restrict ourselves to one of the dialectal varieties, namely, Lebanese Arabic. It also strives to gain a better understanding of the nature of the noise and its distribution. All of this is done by leveraging various spelling correction and morphological tagging neural architectures in a multi-task setting, and by annotating a Lebanese Arabic corpus for spontaneous orthography standardization, and morphological segmentation and tagging, among other features. Additionally, a detailed taxonomy of spelling inconsistencies for Lebanese Arabic is presented and is used to tag the corpus.
This constitutes the first attempt in Dialectal Arabic research to try and categorize spontaneous orthography in a detailed manner.", + "Added By": "Christian Khairallah" +} \ No newline at end of file diff --git a/datasets/annotated_tweet_corpus_in_arabizi,_french_and_english.json b/datasets/annotated_tweet_corpus_in_arabizi,_french_and_english.json new file mode 100644 index 0000000..5d1c736 --- /dev/null +++ b/datasets/annotated_tweet_corpus_in_arabizi,_french_and_english.json @@ -0,0 +1,36 @@ +{ + "Name": "Annotated tweet corpus in Arabizi, French and English", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalogue.elra.info/en-us/repository/browse/ELRA-W0323/", + "License": "Non Commercial Use - ELRA END USER", + "Year": 2022, + "Language": "multilingual", + "Dialect": "mixed", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "In total, 17,103 sequences were annotated from 585,163 tweets (196,374 in English, 254,748 in French and 134,041 in Arabizi), including the themes \u201cOthers\u201d and \u201cIncomprehensible\u201d. Among these sequences, 4,578 sequences having at least 20 tweets annotated with the 3 predefined themes (Hooliganism, Racism and Terrorism) were obtained, including 1,866 sequences with an opinion change. They are distributed as follows: 2,141 sequences in English (57,655 tweets), 1,942 sequences in French (48,854 tweets) and 495 sequences in Arabizi (21,216 tweets). A sub-corpus of 8,733 tweets (1,209 in English, 3,938 in French and 3,585 in Arabizi) annotated as \u201chateful\u201d, according to topic/opinion annotations and by selecting tweets that contained insults, is also provided. ", + "Volume": "134,041", + "Unit": "sentences", + "Ethical Risks": "High", + "Provider": "ELDA", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Latn", + "Tokenized": "No", + "Host": "ELRA", + "Access": "Upon-Request", + "Cost": "nan", + "Test Split": "No", + "Tasks": "topic classification, theme classification, sentiment analysis", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/ans_corpus___claim_verification.json b/datasets/ans_corpus___claim_verification.json new file mode 100644 index 0000000..3260bd6 --- /dev/null +++ b/datasets/ans_corpus___claim_verification.json @@ -0,0 +1,43 @@ +{ + "Name": "ANS CORPUS: claim verification", + "Subsets": [ + { + "Name": "nan", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Volume": "3,786", + "Unit": "sentences" + } + ], + "HF Link": "https://hf.co/datasets/arbml/ANS_stance", + "Link": "https://github.com/latynt/ans", + "License": "Apache-2.0", + "Year": 2020, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "corpus comes in two perspectives: a version consisting of 4,547 true and false claims and a version consisting of 3,786 pairs (claim, evidence).", + "Volume": "4,547", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "Latynt", + "Derived From": "nan", + "Paper Title": "Stance Prediction and Claim Verification: An Arabic Perspective", + "Paper Link": "https://arxiv.org/pdf/2005.10410.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + 
"Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "stance detection, claim verification", + "Venue Title": "FEVER", + "Citations": "8.0", + "Venue Type": "workshop", + "Venue Name": "Workshop on Fact Extraction and Verification", + "Authors": "Jude Khouja", + "Affiliations": "nan", + "Abstract": "This work explores the application of textual entailment in news claim verification and stance prediction using a new corpus in Arabic. The publicly available corpus comes in two perspectives: a version consisting of 4,547 true and false claims and a version consisting of 3,786 pairs (claim, evidence). We describe the methodology for creating the corpus and the annotation process. Using the introduced corpus, we also develop two machine learning baselines for two proposed tasks: claim verification and stance prediction. Our best model utilizes pretraining (BERT) and achieves 76.7 F1 on the stance prediction task and 64.3 F1 on the claim verification task. Our preliminary experiments shed some light on the limits of automatic claim verification that relies on claims text only. Results hint that while the linguistic features and world knowledge learned during pretraining are useful for stance prediction, such learned representations from pretraining are insufficient for verifying claims without access to context or evidence.", + "Added By": "Mustafa Ghaleb" +} \ No newline at end of file diff --git a/datasets/antcorpus.json b/datasets/antcorpus.json new file mode 100644 index 0000000..5af100f --- /dev/null +++ b/datasets/antcorpus.json @@ -0,0 +1,36 @@ +{ + "Name": "ANTCORPUS ", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/antcorpus", + "Link": "https://github.com/antcorpus/antcorpus.data", + "License": "custom", + "Year": 2017, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "crawling", + "Description": "ANT Corpus, which is collected from RSS Feeds.", + "Volume": "6,005", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "Multiple Institutions", + "Derived From": "nan", + "Paper Title": "ANT Corpus: An Arabic News Text Collection for Textual Classification", + "Paper Link": "https://ieeexplore.ieee.org/abstract/document/8308275/authors#authors", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "topic classification", + "Venue Title": "AICCSA", + "Citations": "17.0", + "Venue Type": "conference", + "Venue Name": "International Conference on Computer Systems and Applications", + "Authors": "Amina Chouigui,Oussama Ben Khiroun,Bilel Elayeb", + "Affiliations": ",,", + "Abstract": "We propose in this paper a new online Arabic corpus of news articles, named ANT Corpus, which is collected from RSS Feeds. Each document represents an article structured in the standard XML TREC format. We use the ANT Corpus for Text Classification (TC) by applying the SVM and Naive Bayes (NB) classifiers to assign to each article its accurate predefined category. We study also in this work the contribution of terms weighting, stop-words removal and light stemming on Arabic TC. The experimental results prove that the text length affects considerably the TC accuracy and that titles words are not sufficiently significant to perform good classification rates. 
As a conclusion, the SVM method gives the best results of classification of both titles and texts parts.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/anti-social_behaviour_in_online_communication.json b/datasets/anti-social_behaviour_in_online_communication.json new file mode 100644 index 0000000..2129957 --- /dev/null +++ b/datasets/anti-social_behaviour_in_online_communication.json @@ -0,0 +1,36 @@ +{ + "Name": "Anti-Social Behaviour in Online Communication", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/offensive_language_arabic", + "Link": "https://onedrive.live.com/?authkey=!ACDXj_ZNcZPqzy0&id=6EF6951FBF8217F9!105&cid=6EF6951FBF8217F9", + "License": "unknown", + "Year": 2018, + "Language": "ar", + "Dialect": "mixed", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "a corpus of 15,050 labelled YouTube comments in Arabic", + "Volume": "15,050", + "Unit": "sentences", + "Ethical Risks": "Medium", + "Provider": "University of Limerick", + "Derived From": "nan", + "Paper Title": "Detection of Anti-Social Behaviour in Online Communication in Arabic", + "Paper Link": "https://ulir.ul.ie/bitstream/handle/10344/9946/Alakrot_2019_Detection.pdf?sequence=2", + "Script": "Arab-Latn", + "Tokenized": "No", + "Host": "OneDrive", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "offensive language detection", + "Venue Title": "ACLING", + "Citations": "33.0", + "Venue Type": "conference", + "Venue Name": "International Conference on AI in Computational Linguistics", + "Authors": "Azalden Alakrot,Liam Murray,Nikola S. Nikolov", + "Affiliations": ",,", + "Abstract": "Abstract Warning: this paper contains a range of words which may cause offence. In recent years, many studies target anti-social behaviour such as offensive language and cyberbullying in online communication. Typically, these studies collect data from various reachable sources, the majority of the datasets being in English. However, to the best of our knowledge, there is no dataset collected from the YouTube platform targeting Arabic text and overall there are only a few datasets of Arabic text, collected from other social platforms for the purpose of offensive language detection. Therefore, in this paper we contribute to this field by presenting a dataset of YouTube comments in Arabic, specifically designed to be used for the detection of offensive language in a machine learning scenario. Our dataset contains a range of offensive language and flaming in the form of YouTube comments. We document the labelling process we have conducted, taking into account the difference in the Arab dialects and the diversity of perception of offensive language throughout the Arab world.
Furthermore, statistical analysis of the dataset is presented, in order to make it ready for use as a training dataset for predictive modeling.", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/aoc-aldi.json b/datasets/aoc-aldi.json new file mode 100644 index 0000000..297cfdb --- /dev/null +++ b/datasets/aoc-aldi.json @@ -0,0 +1,36 @@ +{ + "Name": "AOC-ALDi", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/AOC_ALDi", + "Link": "https://github.com/AMR-KELEG/ALDi/raw/master/data/AOC-ALDi.tar.gz", + "License": "MIT License", + "Year": 2023, + "Language": "ar", + "Dialect": "mixed", + "Domain": "commentary", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "Comments on news articles with a continuous level of dialectness score between 0 and 1.", + "Volume": "127,835", + "Unit": "sentences", + "Ethical Risks": "nan", + "Provider": "University of Edinburgh", + "Derived From": "AOC (Arabic Online Commentary)", + "Paper Title": "ALDi: Quantifying the Arabic Level of Dialectness of Text", + "Paper Link": "https://aclanthology.org/2023.emnlp-main.655/", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "dialect identification, level of dialectness", + "Venue Title": "EMNLP", + "Citations": "nan", + "Venue Type": "conference", + "Venue Name": "Conference on Empirical Methods in Natural Language Processing", + "Authors": "Amr Keleg, Sharon Goldwater, Walid Magdy", + "Affiliations": "University of Edinburgh", + "Abstract": "Transcribed speech and user-generated text in Arabic typically contain a mixture of Modern Standard Arabic (MSA), the standardized language taught in schools, and Dialectal Arabic (DA), used in daily communications. To handle this variation, previous work in Arabic NLP has focused on Dialect Identification (DI) on the sentence or the token level. However, DI treats the task as binary, whereas we argue that Arabic speakers perceive a spectrum of dialectness, which we operationalize at the sentence level as the Arabic Level of Dialectness (ALDi), a continuous linguistic variable. We introduce the AOC-ALDi dataset (derived from the AOC dataset), containing 127,835 sentences (17% from news articles and 83% from user comments on those articles) which are manually labeled with their level of dialectness. We provide a detailed analysis of AOC-ALDi and show that a model trained on it can effectively identify levels of dialectness on a range of other corpora (including dialects and genres not included in AOC-ALDi), providing a more nuanced picture than traditional DI systems.
Through case studies, we illustrate how ALDi can reveal Arabic speakers' stylistic choices in different situations, a useful property for sociolinguistic analyses.", + "Added By": "Amr Keleg" +} \ No newline at end of file diff --git a/datasets/aoc.json b/datasets/aoc.json new file mode 100644 index 0000000..9531f76 --- /dev/null +++ b/datasets/aoc.json @@ -0,0 +1,49 @@ +{ + "Name": "AOC", + "Subsets": [ + { + "Name": "MSA", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Volume": "63,555", + "Unit": "sentences" + }, + { + "Name": "Dialectal", + "Dialect": "mixed", + "Volume": "44,618", + "Unit": "sentences" + } + ], + "HF Link": "https://hf.co/datasets/arbml/annotated_aoc", + "Link": "https://github.com/sjeblee/AOC", + "License": "unknown", + "Year": 2011, + "Language": "ar", + "Dialect": "mixed", + "Domain": "news articles", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "a 52M-word monolingual dataset rich in dialectal content", + "Volume": "108,000", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "Johns Hopkins University", + "Derived From": "nan", + "Paper Title": "The Arabic Online Commentary Dataset: an Annotated Dataset of Informal Arabic with High Dialectal Content", + "Paper Link": "https://aclanthology.org/P11-2007.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "dialect identification", + "Venue Title": "ACL", + "Citations": "147.0", + "Venue Type": "conference", + "Venue Name": "Association for Computational Linguistics", + "Authors": "Omar Zaidan,Chris Callison-Burch", + "Affiliations": ",", + "Abstract": "The written form of Arabic, Modern Standard Arabic (MSA), differs quite a bit from the spoken dialects of Arabic, which are the true \"native\" languages of Arabic speakers used in daily life. However, due to MSA's prevalence in written form, almost all Arabic datasets have predominantly MSA content. We present the Arabic Online Commentary Dataset, a 52M-word monolingual dataset rich in dialectal content, and we describe our long-term annotation effort to identify the dialect level (and dialect itself) in each sentence of the dataset. So far, we have labeled 108K sentences, 41% of which as having dialectal content. We also present experimental results on the task of automatic dialect identification, using the collected labels for training and evaluation.", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/apcd.json b/datasets/apcd.json new file mode 100644 index 0000000..101d301 --- /dev/null +++ b/datasets/apcd.json @@ -0,0 +1,36 @@ +{ + "Name": "APCD", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/APCD", + "Link": "https://hci-lab.github.io/LearningMetersPoems/", + "License": "unknown", + "Year": 2019, + "Language": "ar", + "Dialect": "ar-CLS: (Arabic (Classic))", + "Domain": "other", + "Form": "text", + "Collection Style": "crawling", + "Description": "A dataset of Arabic poetry containing 1,831,770 verses along with their meters.
", + "Volume": "1,831,770", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "Helwan University", + "Derived From": "nan", + "Paper Title": "Learning meters of Arabic and English poems with Recurrent Neural Networks: a step forward for language understanding and synthesis", + "Paper Link": "https://arxiv.org/pdf/1905.05700.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "meter classification", + "Venue Title": "arXiv", + "Citations": "nan", + "Venue Type": "preprint", + "Venue Name": "nan", + "Authors": "Waleed A. Yousefa,Omar M. Ibrahime,Taha M. Madboulya, Moustafa A. Mahmoud", + "Affiliations": "Nile university, Nile university, Nile university", + "Abstract": "Recognizing a piece of writing as a poem or prose is\nusually easy for the majority of people; however, only specialists\ncan determine which meter a poem belongs to. In this paper, we\nbuild Recurrent Neural Network (RNN) models that can classify\npoems according to their meters from plain text. The input text\nis encoded at the character level and directly fed to the models\nwithout feature handcrafting. This is a step forward for machine\nunderstanding and synthesis of languages in general, and Arabic\nlanguage in particular. Among the 16 poem meters of Arabic and the 4 meters\nof English the networks were able to correctly classify poem\nwith an overall accuracy of 96.38% and 82.31% respectively.\nThe poem datasets used to conduct this research were massive,\nover 1.5 million of verses, and were crawled from different\nnontechnical sources, almost Arabic and English literature sites,\nand in different heterogeneous and unstructured formats. These\ndatasets are now made publicly available in clean, structured,\nand documented format for other future research.\nTo the best of the authors\u2019 knowledge, this research is the\nfirst to address classifying poem meters in a machine learning approach, in general, and in RNN featureless based approach, in\nparticular. 
In addition, the dataset is the first publicly available dataset ready for the purpose of future computational research.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/apcd2.json b/datasets/apcd2.json new file mode 100644 index 0000000..49d96dc --- /dev/null +++ b/datasets/apcd2.json @@ -0,0 +1,36 @@ +{ + "Name": "APCD2", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/APCDv2", + "Link": "https://github.com/Gheith-Abandah/classify-arabic-poetry", + "License": "unknown", + "Year": 2020, + "Language": "ar", + "Dialect": "ar-CLS: (Arabic (Classic))", + "Domain": "other", + "Form": "text", + "Collection Style": "crawling", + "Description": "1,657k verses of poems and prose used to develop neural networks to classify and diacritize Arabic poetry", + "Volume": "1,831,770", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "The University of Jordan", + "Derived From": "APCD", + "Paper Title": "Classifying and diacritizing Arabic poems using deep recurrent neural networks", + "Paper Link": "https://www.sciencedirect.com/science/article/pii/S1319157820305784/pdfft?md5=07be922e052bf43933bdb7bea5189718&pid=1-s2.0-S1319157820305784-main.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "meter classification", + "Venue Title": "JKSU", + "Citations": "nan", + "Venue Type": "journal", + "Venue Name": "Journal of King Saud University - Computer and Information Sciences", + "Authors": "Gheith A. Abandah, Mohammed Z. Khedher, Mohammad R. Abdel-Majeed, Hamdi M Mansour, Salma F Hulliel, Lara M Bisharata", + "Affiliations": "School of Engineering, The University of Jordan,School of Engineering, The University of Jordan,School of Engineering, The University of Jordan,School of Arts, The University of Jordan, School of Engineering, The University of Jordan, School of Engineering, The University of Jordan", + "Abstract": "Poetry has a prominent history in Arabic literature. The classical Arabic poetry has 16 meters that vary in rhythm and target purpose. Chanting a poem eloquently requires knowing the poem\u2019s meter and obtaining a diacritized version of its verses (letters inscribed with their short vowels); diacritics are often not inscribed in Arabic texts. This work proposes solutions to classify input Arabic text into the 16 poetry meters and prose. It also investigates the automatic diacritization of Arabic poetry. We adopt a machine learning approach using a large dataset of 1657 k verses of poems and prose to develop neural networks to classify and diacritize Arabic poetry. We propose deep and narrow recurrent neural networks with bidirectional long short-term memory cells for solving these problems. The proposed model classifies the input text with an average accuracy of 97.27%, which is significantly higher than previous work. We also propose a solution that achieves an accuracy that approaches 100% when multiple verses of the same poem are available through predicting the class from the aggregate probabilities of the multiple verses.
Diacritizing poetry is much harder than diacritizing prose due to the poet\u2019s meticulous selection of phrases and relaxation of some diacritization rules.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/apgc_v1_0__arabic_parallel_gender_corpus_v1_0.json b/datasets/apgc_v1_0__arabic_parallel_gender_corpus_v1_0.json new file mode 100644 index 0000000..bbf4967 --- /dev/null +++ b/datasets/apgc_v1_0__arabic_parallel_gender_corpus_v1_0.json @@ -0,0 +1,36 @@ +{ + "Name": "APGC v1.0: Arabic Parallel Gender Corpus v1.0", + "Subsets": [], + "HF Link": "nan", + "Link": "https://camel.abudhabi.nyu.edu/arabic-parallel-gender-corpus/", + "License": "custom", + "Year": 2019, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "a corpus designed to support research on gender bias in natural language processing applications working on Arabic", + "Volume": "12,000", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "Multiple Institutions", + "Derived From": "OpenSubtitles", + "Paper Title": "Automatic Gender Identification and Reinflection in Arabic", + "Paper Link": "https://aclanthology.org/W19-3822v2.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "CAMeL Resources", + "Access": "Upon-Request", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "gender identification, gender rewriting", + "Venue Title": "GeBNLP", + "Citations": "13.0", + "Venue Type": "workshop", + "Venue Name": "Workshop on Gender Bias in Natural Language Processing", + "Authors": "Nizar Habash,Houda Bouamor,Christine Chung", + "Affiliations": ",,", + "Abstract": "The impressive progress in many Natural Language Processing (NLP) applications has increased the awareness of some of the biases these NLP systems have with regards to gender identities. In this paper, we propose an approach to extend biased single-output gender-blind NLP systems with gender-specific alternative reinflections. We focus on Arabic, a gender-marking morphologically rich language, in the context of machine translation (MT) from English, and for first-person-singular constructions only. Our contributions are the development of a system-independent gender-awareness wrapper, and the building of a corpus for training and evaluating first-person-singular gender identification and reinflection in Arabic.
Our results successfully demonstrate the viability of this approach with 8% relative increase in BLEU score for first-person-singular feminine, and 5.3% comparable increase for first-person-singular masculine on top of a state-of-the-art gender-blind MT system on a held-out test set.", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/apgc_v2_0__arabic_parallel_gender_corpus_v2_0.json b/datasets/apgc_v2_0__arabic_parallel_gender_corpus_v2_0.json new file mode 100644 index 0000000..7cc21ca --- /dev/null +++ b/datasets/apgc_v2_0__arabic_parallel_gender_corpus_v2_0.json @@ -0,0 +1,36 @@ +{ + "Name": "APGC v2.0: Arabic Parallel Gender Corpus v2.0", + "Subsets": [], + "HF Link": "nan", + "Link": "https://camel.abudhabi.nyu.edu/arabic-parallel-gender-corpus/", + "License": "custom", + "Year": 2021, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "manual curation", + "Description": "The Arabic Parallel Gender Corpus v2.0 (APGC v2.0) is designed to support research on gender bias and personalization in natural language processing applications working on Arabic. It expands on Habash et al. (2019)\u2019s Arabic Parallel Gender Corpus (APGC v1.0) by adding 2nd person targets as well as increasing the total number of sentences over 6.5 times, reaching over 590K words.", + "Volume": "80,326", + "Unit": "sentences", + "Ethical Risks": "Medium", + "Provider": "NYU Abu Dhabi", + "Derived From": "APGC v1.0", + "Paper Title": "The Arabic Parallel Gender Corpus 2.0: Extensions and Analyses", + "Paper Link": "https://arxiv.org/pdf/2110.09216.pdf", + "Script": "Arab", + "Tokenized": "Yes", + "Host": "CAMeL Resources", + "Access": "Upon-Request", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "gender identification, gender rewriting", + "Venue Title": "arXiv", + "Citations": "nan", + "Venue Type": "preprint", + "Venue Name": "nan", + "Authors": "Bashar Alhafni, Nizar Habash, Houda Bouamor", + "Affiliations": "New York University Abu Dhabi, Carnegie Mellon University in Qatar", + "Abstract": "Gender bias in natural language processing (NLP) applications, particularly machine translation, has been receiving increasing attention. Much of the research on this issue has focused on mitigating gender bias in English NLP models and systems. Addressing the problem in poorly resourced, and/or morphologically rich languages has lagged behind, largely due to the lack of datasets and resources. In this paper, we introduce a new corpus for gender identification and rewriting in contexts involving one or two target users (I and/or You) -- first and second grammatical persons with independent grammatical gender preferences. We focus on Arabic, a gender-marking morphologically rich language. The corpus has multiple parallel components: four combinations of 1st and 2nd person in feminine and masculine grammatical genders, as well as English, and English to Arabic machine translation output. This corpus expands on Habash et al. (2019)'s Arabic Parallel Gender Corpus (APGC v1.0) by adding second person targets as well as increasing the total number of sentences over 6.5 times, reaching over 590K words. Our new dataset will aid the research and development of gender identification, controlled text generation, and post-editing rewrite systems that could be used to personalize NLP applications and provide users with the correct outputs based on their grammatical gender preferences.
We make the Arabic Parallel Gender Corpus (APGC v2.0) publicly available.", + "Added By": "Bashar Alhafni" +} \ No newline at end of file diff --git a/datasets/aqad__arabic_question-answer_dataset.json b/datasets/aqad__arabic_question-answer_dataset.json new file mode 100644 index 0000000..10518af --- /dev/null +++ b/datasets/aqad__arabic_question-answer_dataset.json @@ -0,0 +1,36 @@ +{ + "Name": "AQAD: Arabic Question-Answer dataset", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/AQAD", + "Link": "https://github.com/adelmeleka/AQAD", + "License": "unknown", + "Year": 2020, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "wikipedia", + "Form": "text", + "Collection Style": "crawling", + "Description": "Arabic Questions & Answers dataset", + "Volume": "17,911", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "Alexu", + "Derived From": "QA from wikipedia (based on SQuAD 2 articles)", + "Paper Title": "AQAD: 17,000+ Arabic Questions for Machine Comprehension of Text", + "Paper Link": "https://www.semanticscholar.org/paper/AQAD%3A-17%2C000%2B-Arabic-Questions-for-Machine-of-Text-Atef-Mattar/d633e0f0a9fdd24c5e3e697478bcc30fc23c8cc8", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "question answering", + "Venue Title": "AICCSA", + "Citations": "0.0", + "Venue Type": "conference", + "Venue Name": "International Conference on Computer Systems and Applications", + "Authors": "Adel Atef,Bassam Mattar,Sandra Sherif,Eman Elrefai,Marwan Torki", + "Affiliations": ",,,,", + "Abstract": "Current Arabic Machine Reading for Question Answering datasets suffer from important shortcomings. The available datasets are either small-sized high-quality collections or large-sized low-quality datasets. To address the aforementioned problems we present our Arabic Question-Answer dataset (AQAD). AQAD is a new Arabic reading comprehension large-sized high-quality dataset consisting of 17,000+ questions and answers. To collect the AQAD dataset, we present a fully automated data collector. Our collector works on a set of Arabic Wikipedia articles for the extractive question answering task. The chosen articles match the articles used in the well-known Stanford Question Answering Dataset (SQuAD). We provide evaluation results on the AQAD dataset using two state-of-the-art models for machine-reading question answering problems. 
Namely, BERT and BIDAF models which result in 0.37 and 0.32 F-1 measure on AQAD dataset.", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/aqmar.json b/datasets/aqmar.json new file mode 100644 index 0000000..b84d2d8 --- /dev/null +++ b/datasets/aqmar.json @@ -0,0 +1,36 @@ +{ + "Name": "AQMAR", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/AQMAR", + "Link": "https://www.cs.cmu.edu/~ark/ArabicNER/", + "License": "CC BY-SA 3.0", + "Year": 2012, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "wikipedia", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "This is a 74,000-token corpus of 28 Arabic Wikipedia articles hand-annotated for named entities.", + "Volume": "74,000", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "CMU", + "Derived From": "nan", + "Paper Title": "Recall-Oriented Learning of Named Entities in Arabic Wikipedia", + "Paper Link": "https://aclanthology.org/E12-1017.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "named entity recognition", + "Venue Title": "EACL", + "Citations": "64.0", + "Venue Type": "conference", + "Venue Name": "European Chapter of the Association for Computational Linguistics", + "Authors": "B. Mohit,Nathan Schneider,Rishav Bhowmick,Kemal Oflazer,Noah A. Smith", + "Affiliations": ",,,,", + "Abstract": "We consider the problem of NER in Arabic Wikipedia, a semisupervised domain adaptation setting for which we have no labeled training data in the target domain. To facilitate evaluation, we obtain annotations for articles in four topical groups, allowing annotators to identify domain-specific entity types in addition to standard categories. Standard supervised learning on newswire text leads to poor target-domain recall. We train a sequence model and show that a simple modification to the online learner---a loss function encouraging it to \"arrogantly\" favor recall over precision---substantially improves recall and F1. We then adapt our model with self-training on unlabeled target-domain data; enforcing the same recall-oriented bias in the self-training stage yields marginal gains.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/aqqac.json b/datasets/aqqac.json new file mode 100644 index 0000000..53f4a35 --- /dev/null +++ b/datasets/aqqac.json @@ -0,0 +1,36 @@ +{ + "Name": "AQQAC", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/AQQAC", + "Link": "https://archive.researchdata.leeds.ac.uk/464/1/AAQQAC.XML", + "License": "CC BY 4.0", + "Year": 2018, + "Language": "ar", + "Dialect": "mixed", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "AQQAC is a collection of approximately 2224 questions and answers about Al-Quran. Each question and answer is annotated with the question ID, question word (particles), chapter number, verse number, question topic, question type, Al-Quran ontology concepts (Alqahtani & Atwell, 2018) and question source.
The aim of this corpus is to provide a Question-Answering taxonomy for questions about Al-Quran.", + "Volume": "1,224", + "Unit": "sentences", + "Ethical Risks": "High", + "Provider": "University of Leeds", + "Derived From": "nan", + "Paper Title": "Annotated Corpus of Arabic Al-Quran Question and Answer", + "Paper Link": "https://archive.researchdata.leeds.ac.uk/464/", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "question answering", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "preprint", + "Venue Name": "nan", + "Authors": "Mohammad Alqahtani, Eric Atwell", + "Affiliations": "nan", + "Abstract": "AQQAC is a collection of approximately 2224 questions and answers about Al-Quran. Each question and answer is annotated with the question ID, question word (particles), chapter number, verse number, question topic, question type, Al-Quran ontology concepts (Alqahtani & Atwell, 2018) and question source. The aim of this corpus is to provide a Question-Answering taxonomy for questions about Al-Quran. Additionally, this corpus might be used as a data set for testing and evaluating Islamic IR systems. The text of Al-Quran questions and answers was extracted from two trusted Islamic sources: (1000 Su'al Wa Jawab Fi ALKORAN) was compiled by the famous Islamic scholar Ashur (2001). This book contains 1000 questions and answers about Al-Quran written in the Arabic language. Islam \u2013 Al-Quran and Tafseer is a website about Al-Quran that includes a description and a translation of Al-Quran and the reciting rules, the \u201cTajweed\u201d. Additionally, this website has approximately 1224 questions and answers about Al-Quran in the Arabic language extracted from the Altabari Tafseer. Currently, this dataset contains 1224 annotated question-answers and the missing data that hasn\u2019t been shared is due to copyright concerns.", + "Added By": "Abdelrahman Rezk" +} \ No newline at end of file diff --git a/datasets/ar-asag.json b/datasets/ar-asag.json new file mode 100644 index 0000000..6480362 --- /dev/null +++ b/datasets/ar-asag.json @@ -0,0 +1,36 @@ +{ + "Name": "AR-ASAG", + "Subsets": [], + "HF Link": "nan", + "Link": "https://data.mendeley.com/datasets/dj95jh332j/1", + "License": "CC BY 4.0", + "Year": 2020, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "The reported evaluations relate to answers submitted for three different exams by three classes of students.", + "Volume": "2,133", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "Bouira University", + "Derived From": "nan", + "Paper Title": "AR-ASAG An ARabic Dataset for Automatic Short Answer Grading Evaluation", + "Paper Link": "https://aclanthology.org/2020.lrec-1.321.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "Mendeley Data", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "answer grading evaluation", + "Venue Title": "LREC", + "Citations": "0.0", + "Venue Type": "conference", + "Venue Name": "International Conference on Language Resources and Evaluation", + "Authors": "Leila Ouahrani,Djamel Bennouar", + "Affiliations": ",", + "Abstract": "Automatic short answer grading is a significant problem in E-assessment. Several models have been proposed to deal with it. Evaluation and comparison of such solutions need the availability of datasets with manual examples.
In this paper, we introduce AR-ASAG, an Arabic Dataset for automatic short answer grading. The Dataset contains 2133 pairs of (Model Answer, Student Answer) in several versions (txt, xml, Moodle xml and .db). We then explore an unsupervised corpus-based approach for automatic grading adapted to the Arabic language. We use the COALS (Correlated Occurrence Analogue to Lexical Semantics) algorithm to create a semantic space for word distribution. The summation vector model is combined with term weighting and common words to achieve similarity between a teacher model answer and a student answer. The approach is particularly suitable for languages with scarce resources such as the Arabic language, where robust specific resources are not yet available. A set of experiments was conducted to analyze the effect of domain specificity, semantic space dimension and stemming techniques on the effectiveness of the grading model. The proposed approach gives promising results for the Arabic language. The reported results may serve as a baseline for future research work evaluation.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/ar-embiddings__arabic_word_embeddings_for_sentiment_analysis.json b/datasets/ar-embiddings__arabic_word_embeddings_for_sentiment_analysis.json new file mode 100644 index 0000000..5b584b3 --- /dev/null +++ b/datasets/ar-embiddings__arabic_word_embeddings_for_sentiment_analysis.json @@ -0,0 +1,36 @@ +{ + "Name": "ar-embeddings: Arabic Word Embeddings for Sentiment Analysis", + "Subsets": [], + "HF Link": "nan", + "Link": "https://github.com/iamaziz/ar-embeddings", + "License": "Non Commercial Use - ELRA END USER", + "Year": 2016, + "Language": "ar", + "Dialect": "mixed", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "A large corpus for generating Arabic word embeddings from multiple sources such as news articles, consumer reviews, Quran text, and tweets. The embeddings are used to perform sentiment analysis in both Standard and Dialectal Arabic without relying on hand-crafted features. The embeddings are applied to several binary classifiers to detect subjectivity and sentiment in Arabic texts.", + "Volume": "190,000,000", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "Pace University", + "Derived From": "Quran-text, Watan-2004, CNN-Arabic, BBC-Arabic, Consumer Reviews", + "Paper Title": "Word Embeddings for Arabic Sentiment Analysis", + "Paper Link": "https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=7841054", + "Script": "Arab", + "Tokenized": "Yes", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "sentiment analysis, subjectivity analysis", + "Venue Title": "Big Data", + "Citations": "nan", + "Venue Type": "conference", + "Venue Name": "IEEE International Conference on Big Data (Big Data)", + "Authors": "A. Aziz Altowayan, Lixin Tao", + "Affiliations": "Pace University", + "Abstract": "This paper presents an approach to using word embeddings for Arabic sentiment analysis in both Standard and Dialectal Arabic. The authors compile a large corpus from various sources and train word vectors (embeddings) using the CBOW model of word2vec. These embeddings are used to train several binary classifiers to detect sentiment and subjectivity in Arabic tweets, consumer reviews, and news articles. The approach achieves competitive performance without relying on manually crafted features.
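The grading approach described for AR-ASAG (a summation vector model over a distributional semantic space, scored by similarity between teacher and student answers) can be sketched in a few lines. This is a rough illustration, not the authors' implementation: gensim's Word2Vec stands in for the COALS space, and the toy corpus and answers are placeholders:

```python
import numpy as np
from gensim.models import Word2Vec

# Toy corpus standing in for the domain text used to build the semantic space.
corpus = [
    "الخوارزمية مجموعة خطوات محددة لحل مشكلة".split(),
    "البرنامج تنفيذ خوارزمية على الحاسوب".split(),
]
model = Word2Vec(corpus, vector_size=50, min_count=1, epochs=100)

def answer_vector(tokens, wv):
    """Summation vector model: an answer is the sum of its word vectors."""
    vectors = [wv[t] for t in tokens if t in wv]
    return np.sum(vectors, axis=0) if vectors else np.zeros(wv.vector_size)

def similarity_grade(teacher_answer, student_answer, wv, max_grade=5.0):
    """Scale cosine similarity between the two answers to a grade."""
    a = answer_vector(teacher_answer, wv)
    b = answer_vector(student_answer, wv)
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    cosine = float(a @ b / denom) if denom else 0.0
    return max(cosine, 0.0) * max_grade

teacher = "الخوارزمية مجموعة خطوات محددة لحل مشكلة".split()
student = "الخوارزمية خطوات لحل مشكلة".split()
print(round(similarity_grade(teacher, student, model.wv), 2))
```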
The corpus and code are made publicly available for future research.", + "Added By": "Maryam Al Emadi" +} \ No newline at end of file diff --git a/datasets/ara-timebank.json b/datasets/ara-timebank.json new file mode 100644 index 0000000..382ab09 --- /dev/null +++ b/datasets/ara-timebank.json @@ -0,0 +1,36 @@ +{ + "Name": "ARA-TimeBank", + "Subsets": [], + "HF Link": "nan", + "Link": "https://github.com/nafaa5/Arabic-event-timex-gazetteers-", + "License": "unknown", + "Year": 2020, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "manual curation", + "Description": "An enriched Arabic corpus, called \u201cARA-TimeBank\u201d, annotated for events, temporal expressions and temporal relations based on the new Arabic TimeML.", + "Volume": "1,000", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "Multiple Institutions", + "Derived From": "nan", + "Paper Title": "Enrichment of Arabic TimeML Corpus", + "Paper Link": "https://link.springer.com/chapter/10.1007/978-3-030-63007-2_51", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "event detection", + "Venue Title": "ICCCI", + "Citations": "0.0", + "Venue Type": "conference", + "Venue Name": "international scientific conference for research in the field of Computational Collective Intelligence", + "Authors": "Nafaa Haffar,Emna Hkiri,M. Zrigui", + "Affiliations": ",,", + "Abstract": "Automatic temporal information extraction is an important task for many natural language processing systems. This task requires thorough knowledge of the ontological and grammatical characteristics of temporal information in the text as well as annotated linguistic resources of the temporal entities. Before creating the resources or developing the system, it is first necessary to define a structured schema which describes how to annotate temporal entities. In this paper, we present a revised version of Arabic TimeML, and we propose an enriched Arabic corpus, called \u201cARA-TimeBank\u201d, for events, temporal expressions and temporal relations based on the new Arabic TimeML. We describe our methodology, which combines a pre-annotation phase with manual validation and verification.
ARA-TimeBank is the first corpus constructed for Arabic, which meets the needs of TimeML and addresses the limitations of existing Arabic TimeBank.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/arab-acquis.json b/datasets/arab-acquis.json new file mode 100644 index 0000000..d2716c5 --- /dev/null +++ b/datasets/arab-acquis.json @@ -0,0 +1,36 @@ +{ + "Name": "Arab-Acquis", + "Subsets": [], + "HF Link": "nan", + "Link": "https://camel.abudhabi.nyu.edu/arabacquis/", + "License": "custom", + "Year": 2017, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "crawling and annotation(translation)", + "Description": "Consists of over 12,000 sentences from the JRC-Acquis (Acquis Communautaire) corpus.", + "Volume": "12,000", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "NYU Abu Dhabi", + "Derived From": "nan", + "Paper Title": "A Parallel Corpus for Evaluating Machine Translation between Arabic and European Languages", + "Paper Link": "https://aclanthology.org/E17-2038.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "CAMeL Resources", + "Access": "Upon-Request", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "machine translation", + "Venue Title": "EACL", + "Citations": "9.0", + "Venue Type": "conference", + "Venue Name": "European Chapter of the Association for Computational Linguistics", + "Authors": "Nizar Habash,Nasser Zalmout,Dima Taji,Hieu Hoang,Maverick Alzate", + "Affiliations": ",,,,", + "Abstract": "We present Arab-Acquis, a large publicly available dataset for evaluating machine translation between 22 European languages and Arabic. Arab-Acquis consists of over 12,000 sentences from the JRC-Acquis (Acquis Communautaire) corpus translated twice by professional translators, once from English and once from French, and totaling over 600,000 words. The corpus follows previous data splits in the literature for tuning, development, and testing. We describe the corpus and how it was created.
We also present the first benchmarking results on translating to and from Arabic for 22 European languages.", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/arab-andalusian_music_corpus.json b/datasets/arab-andalusian_music_corpus.json new file mode 100644 index 0000000..d445bf8 --- /dev/null +++ b/datasets/arab-andalusian_music_corpus.json @@ -0,0 +1,36 @@ +{ + "Name": "Arab-Andalusian music corpus", + "Subsets": [], + "HF Link": "nan", + "Link": "https://zenodo.org/record/1291776#.YqTFeHZBxD9", + "License": "CC BY-NC 4.0", + "Year": 2018, + "Language": "ar", + "Dialect": "mixed", + "Domain": "transcribed audio", + "Form": "spoken", + "Collection Style": "other", + "Description": "A collection of 164 concert recordings of Arab-Andalusian music, with an overall playable time of more than 125 hours.", + "Volume": "125", + "Unit": "hours", + "Ethical Risks": "Low", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "zenodo", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "speech classification", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/arab-esl.json b/datasets/arab-esl.json new file mode 100644 index 0000000..883ac1b --- /dev/null +++ b/datasets/arab-esl.json @@ -0,0 +1,36 @@ +{ + "Name": "Arab-ESL", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/emoji_sentiment_lexicon", + "Link": "https://github.com/ShathaHakami/Arabic-Emoji-Sentiment-Lexicon-Version-1.0/blob/main/Arabic_Emoji_Sentiment_Lexicon_Version_1.0.csv", + "License": "Non Commercial Use - ELRA END USER", + "Year": 2021, + "Language": "ar", + "Dialect": "mixed", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "Emoji (the popular digital pictograms) are sometimes seen as a new kind of artificial and universally usable and consistent writing code. In spite of their assumed universality, there is some evidence that the sense of an emoji, specifically in regard to sentiment, may change from language to language and culture to culture. This paper investigates whether contextual emoji sentiment analysis is consistent across Arabic and European languages. To conduct this investigation, we, first, created the Arabic emoji sentiment lexicon (Arab-ESL). Then, we exploited an existing European emoji sentiment lexicon to compare the sentiment conveyed in each of the two families of language and culture (Arabic and European). The results show that the pairwise correlation between the two lexicons is consistent for emoji that represent, for instance, hearts, facial expressions, and body language. However, for a subset of emoji (those that represent objects, nature, symbols, and some human activities), there are large differences in the sentiment conveyed.
More interestingly, an extremely high level of inconsistency has been shown with food emoji.", + "Volume": "1,034", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "Jazan University, University of Birmingham", + "Derived From": "nan", + "Paper Title": "Arabic Emoji Sentiment Lexicon (Arab-ESL): A Comparison between Arabic and European Emoji Sentiment Lexicons", + "Paper Link": "https://aclanthology.org/2021.wanlp-1.7/", + "Script": "Arab", + "Tokenized": "Yes", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "sentiment analysis", + "Venue Title": "WANLP", + "Citations": "1.0", + "Venue Type": "workshop", + "Venue Name": "Arabic Natural Language Processing Workshop", + "Authors": "Shatha Ali A. Hakami, Robert Hendley, Phillip Smith", + "Affiliations": "Jazan University / University of Birmingham", + "Abstract": "Emoji (the popular digital pictograms) are sometimes seen as a new kind of artificial and universally usable and consistent writing code. In spite of their assumed universality, there is some evidence that the sense of an emoji, specifically in regard to sentiment, may change from language to language and culture to culture. This paper investigates whether contextual emoji sentiment analysis is consistent across Arabic and European languages. To conduct this investigation, we, first, created the Arabic emoji sentiment lexicon (Arab-ESL). Then, we exploited an existing European emoji sentiment lexicon to compare the sentiment conveyed in each of the two families of language and culture (Arabic and European). The results show that the pairwise correlation between the two lexicons is consistent for emoji that represent, for instance, hearts, facial expressions, and body language. However, for a subset of emoji (those that represent objects, nature, symbols, and some human activities), there are large differences in the sentiment conveyed. More interestingly, an extremely high level of inconsistency has been shown with food emoji.", + "Added By": "Shatha Ali A. Hakami" +} \ No newline at end of file diff --git a/datasets/arab_states_analogy_dataset_(asad).json b/datasets/arab_states_analogy_dataset_(asad).json new file mode 100644 index 0000000..7f635c5 --- /dev/null +++ b/datasets/arab_states_analogy_dataset_(asad).json @@ -0,0 +1,36 @@ +{ + "Name": "Arab States Analogy Dataset (ASAD)", + "Subsets": [], + "HF Link": "https://hf.co/datasets/SaiedAlshahrani/ASAD", + "Link": "https://github.com/SaiedAlshahrani/performance-implications/tree/main/Word-Representation-Evals/ASAD", + "License": "MIT License", + "Year": 2023, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "manual curation", + "Description": "ASAD is a word analogy dataset created using 20 Arab States with their corresponding capital cities, nationalities, currencies, and on which continents they are located, consisting of four sets: country-capital set, country-currency set, country-nationality set, and country-continent set. Each set has 380 word analogies, and the total number of word analogies in the ASAD dataset is 1520. This dataset is used to evaluate Arabic Word Embedding Models (WEMs). 
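A word-analogy evaluation of the kind ASAD supports solves a : b :: c : ? by vector arithmetic and checks the prediction against the gold fourth word. A minimal sketch using gensim's KeyedVectors; the vectors file and the example quadruple are hypothetical, not taken from the dataset itself:

```python
from gensim.models import KeyedVectors

# Hypothetical pretrained Arabic word vectors saved as KeyedVectors.
wv = KeyedVectors.load("arabic_wem.kv")

def solve_analogy(a, b, c):
    """Return the model's best d for a : b :: c : d (vector offset method)."""
    return wv.most_similar(positive=[b, c], negative=[a], topn=1)[0][0]

def analogy_accuracy(quadruples):
    hits = sum(solve_analogy(a, b, c) == d for a, b, c, d in quadruples)
    return hits / len(quadruples)

# A country-capital quadruple in the spirit of ASAD's four analogy sets.
quads = [("قطر", "الدوحة", "الأردن", "عمان")]
print(analogy_accuracy(quads))
```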
", + "Volume": "1,520", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "Clarkson University", + "Derived From": "nan", + "Paper Title": "Performance Implications of Using Unrepresentative Corpora in Arabic Natural Language Processing", + "Paper Link": "https://aclanthology.org/2023.arabicnlp-1.19.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "HuggingFace", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "embedding", + "Venue Title": "ArabicNLP 2023", + "Citations": "nan", + "Venue Type": "conference", + "Venue Name": "The First Arabic Natural Language Processing Conference", + "Authors": "Saied Alshahrani, Norah Alshahrani, Soumyabrata Dey, Jeanna Matthews", + "Affiliations": "Clarkson University", + "Abstract": "Wikipedia articles are a widely used source of training data for Natural Language Processing (NLP) research, particularly as corpora for low-resource languages like Arabic. However, it is essential to understand the extent to which these corpora reflect the representative contributions of native speakers, especially when many entries in a given language are directly translated from other languages or automatically generated through automated mechanisms. In this paper, we study the performance implications of using inorganic corpora that are not representative of native speakers and are generated through automated techniques such as bot generation or automated template-based translation. The case of the Arabic Wikipedia editions gives a unique case study of this since the Moroccan Arabic Wikipedia edition (ARY) is small but representative, the Egyptian Arabic Wikipedia edition (ARZ) is large but unrepresentative, and the Modern Standard Arabic Wikipedia edition (AR) is both large and more representative. We intrinsically evaluate the performance of two main NLP upstream tasks, namely word representation and language modeling, using word analogy evaluations and fill-mask evaluations using our two newly created datasets: Arab States Analogy Dataset (ASAD) and Masked Arab States Dataset (MASD). We demonstrate that for good NLP performance, we need both large and organic corpora; neither alone is sufficient. We show that producing large corpora through automated means can be a counter-productive, producing models that both perform worse and lack cultural richness and meaningful representation of the Arabic language and its native speakers.", + "Added By": "Saied Alshahrani" +} \ No newline at end of file diff --git a/datasets/arabceleb.json b/datasets/arabceleb.json new file mode 100644 index 0000000..8289fa1 --- /dev/null +++ b/datasets/arabceleb.json @@ -0,0 +1,36 @@ +{ + "Name": "ArabCeleb", + "Subsets": [], + "HF Link": "nan", + "Link": "https://github.com/CeLuigi/ArabCeleb", + "License": "CC BY 4.0", + "Year": 2021, + "Language": "ar", + "Dialect": "mixed", + "Domain": "transcribed audio", + "Form": "spoken", + "Collection Style": "crawling", + "Description": "ArabCeleb is an audio dataset collected in the wild that specifically focuses on arabic language. The proposed dataset contains 1930 utterances from 100 celebrities taken from video on YouTube.com. 
The dataset can be used for several speaker recognition tasks (identification, verification, gender recognition) as well as multimodal recognition tasks integrating audio and video tracks.", + "Volume": "1,930", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "ArabCeleb: Speaker Recognition in Arabic", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "speaker recognition", + "Venue Title": "AIxIA", + "Citations": "nan", + "Venue Type": "conference", + "Venue Name": "International Conference of the Italian Association for Artificial Intelligence", + "Authors": "Bianco, Simone and Celona, Luigi and Khalifa, Intissar and Napoletano, Paolo and Petrovsky, Alexey and Piccoli, Flavio and Schettini, Raimondo and Shanin, Ivan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/arabench.json b/datasets/arabench.json new file mode 100644 index 0000000..ab58f5d --- /dev/null +++ b/datasets/arabench.json @@ -0,0 +1,36 @@ +{ + "Name": "Arabench", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/AraBench", + "Link": "https://alt.qcri.org/resources1/mt/arabench/", + "License": "Apache-2.0", + "Year": 2020, + "Language": "ar", + "Dialect": "mixed", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "an evaluation suite for dialectal Arabic to English machine translation", + "Volume": "947,000", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "QCRI", + "Derived From": "contains data from APT, MDC, MADAR, QAraC, the Bible", + "Paper Title": "AraBench: Benchmarking Dialectal Arabic-English Machine Translation", + "Paper Link": "https://aclanthology.org/2020.coling-main.447.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "QCRI Resources", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "machine translation", + "Venue Title": "COLING", + "Citations": "1.0", + "Venue Type": "conference", + "Venue Name": "International Conference on Computational Linguistics", + "Authors": "Hassan Sajjad,Ahmed Abdelali,Nadir Durrani,Fahim Dalvi", + "Affiliations": ",,Qatar Computing Research Institute,", + "Abstract": "Low-resource machine translation suffers from the scarcity of training data and the unavailability of standard evaluation sets. While a number of research efforts target the former, the unavailability of evaluation benchmarks remains a major hindrance in tracking the progress in low-resource machine translation. In this paper, we introduce AraBench, an evaluation suite for dialectal Arabic to English machine translation. Compared to Modern Standard Arabic, Arabic dialects are challenging due to their spoken nature, non-standard orthography, and a large variation in dialectness. To this end, we pool together already available Dialectal Arabic-English resources and additionally build novel test sets. AraBench offers 4 coarse, 15 fine-grained and 25 city-level dialect categories, belonging to diverse genres, such as media, chat, religion and travel with varying level of dialectness. We report strong baselines using several training settings: fine-tuning, back-translation and data augmentation. The evaluation suite opens a wide range of research frontiers to push efforts in low-resource machine translation, particularly Arabic dialect translation.
The evaluation suite and the dialectal system are publicly available for research purposes.", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/arabglossbert.json b/datasets/arabglossbert.json new file mode 100644 index 0000000..8c14dcd --- /dev/null +++ b/datasets/arabglossbert.json @@ -0,0 +1,36 @@ +{ + "Name": "ArabGlossBERT", + "Subsets": [], + "HF Link": "nan", + "Link": "https://huggingface.co/spaces/SinaLab/ArabGlossBERT/tree/main", + "License": "CC BY 4.0", + "Year": 2021, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "manual curation", + "Description": "A dataset of about 167k labeled Arabic context-gloss pairs extracted from the Arabic Ontology and the lexicographic database at Birzeit University, built for Arabic Word Sense Disambiguation (WSD) framed as a sentence-pair binary classification task. Each pair is labeled as True or False, with target words identified and annotated. The dataset was used to fine-tune three pre-trained Arabic BERT models, incorporating supervised signals to emphasize target words in context; the best model achieved an accuracy of 84% despite the use of a large set of senses.", + "Volume": "167,000", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "SinaLab, Birzeit University", + "Derived From": "nan", + "Paper Title": "ArabGlossBERT: Fine-Tuning BERT on Context-Gloss Pairs for WSD.", + "Paper Link": "https://arxiv.org/abs/2205.09685", + "Script": "Arab", + "Tokenized": "No", + "Host": "SinaLab Resources", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "language modeling, natural language inference", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "Moustafa Al-Hajj, Mustafa Jarrar", + "Affiliations": "nan", + "Abstract": "Using pre-trained transformer models such as BERT has proven to be effective in many NLP tasks. This paper presents our work to fine-tune BERT models for Arabic Word Sense Disambiguation (WSD). We treated the WSD task as a sentence-pair binary classification task. First, we constructed a dataset of labeled Arabic context-gloss pairs (\u223c167k pairs) we extracted from the Arabic Ontology and the large lexicographic database available at Birzeit University. Each pair was labeled as True or False and target words in each context were identified and annotated. Second, we used this dataset for fine-tuning three pretrained Arabic BERT models. Third, we experimented with the use of different supervised signals used to emphasize target words in context.
Our experiments achieved promising results (accuracy of 84%) although we used a large set of senses in the experiment.", + "Added By": "Tymaa Hammouda" +} \ No newline at end of file diff --git a/datasets/arabic-dataset-for-capt.json b/datasets/arabic-dataset-for-capt.json new file mode 100644 index 0000000..d0e0560 --- /dev/null +++ b/datasets/arabic-dataset-for-capt.json @@ -0,0 +1,36 @@ +{ + "Name": "Arabic-Dataset-for-CAPT", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/CAPT", + "Link": "https://github.com/bhalima/Arabic-Dataset-for-CAPT", + "License": "unknown", + "Year": 2015, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "transcribed audio", + "Form": "spoken", + "Collection Style": "other", + "Description": "The dataset includes both \u201ccorrect\u201d and \u201cwrong\u201d non-artificial pronunciations. The pronunciations are from nine pupils aged from 5 to 8 years; each of them uttered a set of 16 sequences (words or groups of words). The chosen words included some difficulties for learners, such as long vowels and words written with more than one connected component.", + "Volume": "143", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "University Badji Mokhtar", + "Derived From": "nan", + "Paper Title": "A statistical-based decision for Arabic pronunciation assessment", + "Paper Link": "https://link.springer.com/article/10.1007/s10772-014-9248-2", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "speech recognition", + "Venue Title": "IJST", + "Citations": "nan", + "Venue Type": "journal", + "Venue Name": "International Journal of Speech Technology", + "Authors": "Khaled Necibi, Halima Bahi", + "Affiliations": "University Badji Mokhtar", + "Abstract": "The aim of a computer assisted language learning (CALL) system is to improve the language skills of learners. Such systems often include grammar and vocabulary components, while pronunciation learning seems to be the hardest step in the language learning process. Little attention has been paid to this aspect among the required ones in CALL systems. In a pronunciation learning context, the learner would like to know if their pronunciation is good or bad. In the case where the pronunciation is bad, it is helpful if some advice is given. The goal of this work is the early detection of pupils with reading difficulties; deciding whether their pronunciation is good or not is our particular interest.
For this purpose, we consider the answer to this question as a classification problem and we use a statistical approach to make a decision; this approach allows us to pursue the investigation concerning the pronunciation of every phoneme in the word or in the sentence.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/arabic-dialect_english_parallel_text.json b/datasets/arabic-dialect_english_parallel_text.json new file mode 100644 index 0000000..5d1a9bc --- /dev/null +++ b/datasets/arabic-dialect_english_parallel_text.json @@ -0,0 +1,43 @@ +{ + "Name": "Arabic-Dialect/English Parallel Text", + "Subsets": [ + { + "Name": "nan", + "Dialect": "ar-EG: (Arabic (Egypt))", + "Volume": "380,000", + "Unit": "tokens" + } + ], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2012T09", + "License": "LDC User Agreement for Non-Members", + "Year": 2012, + "Language": "multilingual", + "Dialect": "ar-LEV: (Arabic (Levant))", + "Domain": "other", + "Form": "text", + "Collection Style": "crawling and annotation(translation)", + "Description": "It uses crowdsourcing to cheaply and quickly build Levantine-English and Egyptian-English parallel corpora, consisting of 1.1M words and 380k words, respectively.", + "Volume": "1,500,000", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "Multiple Institutions", + "Derived From": "nan", + "Paper Title": "Machine Translation of Arabic Dialects", + "Paper Link": "https://aclanthology.org/N12-1006.pdf", + "Script": "Arab-Latn", + "Tokenized": "Yes", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "2250 $", + "Test Split": "No", + "Tasks": "machine translation", + "Venue Title": "NAACL", + "Citations": "158.0", + "Venue Type": "conference", + "Venue Name": "North American Chapter of the Association for Computational Linguistics", + "Authors": "Rabih Zbib,Erika Malchiodi,J. Devlin,D. Stallard,S. Matsoukas,R. Schwartz,J. Makhoul,Omar Zaidan,Chris Callison-Burch", + "Affiliations": ",,,,,,,,", + "Abstract": "Arabic Dialects present many challenges for machine translation, not least of which is the lack of data resources. We use crowdsourcing to cheaply and quickly build Levantine-English and Egyptian-English parallel corpora, consisting of 1.1M words and 380k words, respectively. The dialectal sentences are selected from a large corpus of Arabic web text, and translated using Amazon's Mechanical Turk. We use this data to build Dialectal Arabic MT systems, and find that small amounts of dialectal data have a dramatic impact on translation quality.
When translating Egyptian and Levantine test sets, our Dialectal Arabic MT system performs 6.3 and 7.0 BLEU points higher than a Modern Standard Arabic MT system trained on a 150M-word Arabic-English parallel corpus.", + "Added By": "Mustafa Ghaleb" +} \ No newline at end of file diff --git a/datasets/arabic-english_named_entities_dataset.json b/datasets/arabic-english_named_entities_dataset.json new file mode 100644 index 0000000..1b6efbc --- /dev/null +++ b/datasets/arabic-english_named_entities_dataset.json @@ -0,0 +1,36 @@ +{ + "Name": "Arabic-English named entities dataset", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/Named_Entities_Lexicon", + "Link": "https://github.com/Hkiri-Emna/Named_Entities_Lexicon_Project", + "License": "CC BY 4.0", + "Year": 2017, + "Language": "multilingual", + "Dialect": "mixed", + "Domain": "news articles", + "Form": "text", + "Collection Style": "crawling and annotation(translation)", + "Description": "The Arabic-English named entities dataset is created using DBpedia Linked datasets and a parallel corpus. For annotating NE in the monolingual English corpus we used the GATE tool. Our approach is based on linked data entities, mapping them to GATE Gazetteers and then constructing a type-oriented NE base covering person, location and organization classes. The second task consists of using machine translation to translate these entities and, finally, generating our NE lexicon that encloses the list of Arabic entities that match the English lists.", + "Volume": "48,753", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "Monastir University - Tunisia, Umm Al-Qura University - Saudi Arabia", + "Derived From": "nan", + "Paper Title": "Constructing a Lexicon of Arabic-English Named Entity using SMT and Semantic Linked Data", + "Paper Link": "http://iajit.org/PDF//vol.%2014,%20no%206/10491.pdf", + "Script": "Arab-Latn", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "machine translation, named entity recognition", + "Venue Title": "IAJIT", + "Citations": "nan", + "Venue Type": "journal", + "Venue Name": "The International Arab Journal of Information Technology", + "Authors": "Emna Hkiri, Souheyl Mallat, Mounir Zrigui and Mourad Mars", + "Affiliations": "nan", + "Abstract": "Named Entity Recognition (NER) is the problem of locating and categorizing atomic entities in a given text. In this work, we used DBpedia Linked datasets and combined existing open source tools to generate from a parallel corpus a bilingual lexicon of Named Entities (NE). To annotate NE in the monolingual English corpus, we used linked data entities by mapping them to GATE Gazetteers. In order to translate entities identified by the GATE tool from the English corpus, we used Moses, a Statistical Machine Translation (SMT) system. The construction of the Arabic-English NE lexicon is based on the results of Moses translation. Our method is fully automatic and aims to help Natural Language Processing (NLP) tasks such as Machine Translation (MT), information retrieval, text mining and question answering. Our lexicon contains 48,753 pairs of Arabic-English NE; it is freely available for use by other researchers.
", + "Added By": "Mourad Mars" +} \ No newline at end of file diff --git a/datasets/arabic-hebrew_ted_talks_parallel_corpus.json b/datasets/arabic-hebrew_ted_talks_parallel_corpus.json new file mode 100644 index 0000000..4ee943e --- /dev/null +++ b/datasets/arabic-hebrew_ted_talks_parallel_corpus.json @@ -0,0 +1,36 @@ +{ + "Name": "Arabic-Hebrew TED Talks Parallel Corpus", + "Subsets": [], + "HF Link": "nan", + "Link": "https://github.com/ajinkyakulkarni14/TED-Multilingual-Parallel-Corpus", + "License": "unknown", + "Year": 2016, + "Language": "multilingual", + "Dialect": "mixed", + "Domain": "other", + "Form": "text", + "Collection Style": "manual curation", + "Description": "This dataset consists of 2023 TED talks with aligned Arabic and Hebrew subtitles. Sentences were rebuilt and aligned using English as a pivot to improve accuracy, offering a valuable resource for Arabic-Hebrew machine translation tasks.", + "Volume": "225,000", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "Fondazione Bruno Kessler (FBK)", + "Derived From": "TED talks, WIT3 corpus", + "Paper Title": "An Arabic-Hebrew parallel corpus of TED talks", + "Paper Link": "https://arxiv.org/pdf/1610.00572.pdf", + "Script": "Arab-Latn", + "Tokenized": "Yes", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "machine translation, language modeling, cross-lingual information retrieval, natural language inference", + "Venue Title": "IWSLT", + "Citations": "nan", + "Venue Type": "conference", + "Venue Name": "International Workshop on Spoken Language Translation ", + "Authors": "Mauro Cettolo", + "Affiliations": "Fondazione Bruno Kessler (FBK), Trento, Italy", + "Abstract": "The paper describes the creation of an Arabic-Hebrew parallel corpus from TED talks, aligned using English as a pivot. The benchmark contains around 225,000 sentences and 3.5 million tokens in each language. It was prepared to assist machine translation tasks for Arabic-Hebrew and has been partitioned into train, development, and test sets similar to the IWSLT 2016 evaluation campaign.", + "Added By": "Maryam Al Emadi" +} \ No newline at end of file diff --git a/datasets/arabic-multi-classification-dataset-amcd.json b/datasets/arabic-multi-classification-dataset-amcd.json new file mode 100644 index 0000000..2eb8650 --- /dev/null +++ b/datasets/arabic-multi-classification-dataset-amcd.json @@ -0,0 +1,36 @@ +{ + "Name": "Arabic-Multi-Classification-Dataset-AMCD", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/AMCD", + "Link": "https://github.com/waelyafooz/Arabic-Multi-Classification-Dataset-AMCD", + "License": "unknown", + "Year": 2021, + "Language": "ar", + "Dialect": "mixed", + "Domain": "commentary", + "Form": "text", + "Collection Style": "crawling", + "Description": "This is an Arabic dataset called Arabic Multi Classification Dataset version 0.1 (AMCD). AMCD can be used for texting mining and clustering and classification algorithm. It collected from YouTube Videos meta data and user comments. 
It can be used for topic modelling, text summarization, and for applying classification or clustering algorithms.", + "Volume": "8,046", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "topic modeling, summarization", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "Waelyafooz", + "Abstract": "nan", + "Added By": "Abdelrahman Rezk" +} \ No newline at end of file diff --git a/datasets/arabic-news.json b/datasets/arabic-news.json new file mode 100644 index 0000000..cb22792 --- /dev/null +++ b/datasets/arabic-news.json @@ -0,0 +1,36 @@ +{ + "Name": "Arabic-News", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/Arabic_News", + "Link": "https://github.com/motazsaad/Arabic-News", + "License": "unknown", + "Year": 2019, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "crawling", + "Description": "Arabic news articles for language modeling, collected from BBC Arabic, EuroNews, Aljazeera, CNN Arabic, and RT Arabic.", + "Volume": "713,134", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "-", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "text generation, language modeling", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/arabic-ocr.json b/datasets/arabic-ocr.json new file mode 100644 index 0000000..57131dd --- /dev/null +++ b/datasets/arabic-ocr.json @@ -0,0 +1,36 @@ +{ + "Name": "Arabic-OCR", + "Subsets": [], + "HF Link": "nan", + "Link": "https://github.com/Kareem-Emad/arabic-ocr/blob/master/README.md", + "License": "custom", + "Year": 2018, + "Language": "ar", + "Dialect": "mixed", + "Domain": "other", + "Form": "images", + "Collection Style": "other", + "Description": "This dataset contains multi-font Arabic printed text images used for text segmentation and OCR tasks. It involves different fonts and styles of Arabic script.
The data is segmented using a hybrid method for character recognition, focusing on features like text skew correction, line, word, and character segmentation, and feature extraction for Arabic characters.", + "Volume": "83", + "Unit": "images", + "Ethical Risks": "Low", + "Provider": "King Saud University", + "Derived From": "OCR printed Arabic texts", + "Paper Title": "A New Hybrid Method for Arabic Multi-font Text Segmentation, and a Reference Corpus Construction", + "Paper Link": "https://doi.org/10.1016/j.jksuci.2018.07.003", + "Script": "Arab", + "Tokenized": "Yes", + "Host": "other", + "Access": "Upon-Request", + "Cost": "nan", + "Test Split": "No", + "Tasks": "text segmentation, character recognition, optical character recognition", + "Venue Title": "JKSU", + "Citations": "nan", + "Venue Type": "journal", + "Venue Name": "Journal of King Saud University - Computer and Information Sciences", + "Authors": "Abdelhay Zoizou, Arsalane Zarghili, Ilham Chaker", + "Affiliations": "King Saud University", + "Abstract": "This paper introduces a hybrid method for Arabic multi-font text segmentation and reference corpus construction. It focuses on segmenting Arabic text in multiple fonts for Optical Character Recognition (OCR), leveraging techniques such as image skew correction, line, word, and character segmentation. The paper also discusses feature extraction for character recognition, including identifying character dots and concavities.", + "Added By": "Maryam Al Emadi" +} \ No newline at end of file diff --git a/datasets/arabic-openhermes-2_5.json b/datasets/arabic-openhermes-2_5.json new file mode 100644 index 0000000..eaf21b9 --- /dev/null +++ b/datasets/arabic-openhermes-2_5.json @@ -0,0 +1,36 @@ +{ + "Name": "Arabic-OpenHermes-2.5", + "Subsets": [], + "HF Link": "https://hf.co/datasets/2A2I/Arabic-OpenHermes-2.5", + "Link": "https://hf.co/datasets/2A2I/Arabic-OpenHermes-2.5", + "License": "Apache-2.0", + "Year": 2024, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "machine translation", + "Description": "Arabic-OpenHermes-2.5 is a carefully curated dataset extracted/translated from the OpenHermes-2.5 collection provided by teknium.", + "Volume": "982,000", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "2A2I", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "HuggingFace", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "instruction tuning", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/arabic-poem-emotion.json b/datasets/arabic-poem-emotion.json new file mode 100644 index 0000000..fab0b18 --- /dev/null +++ b/datasets/arabic-poem-emotion.json @@ -0,0 +1,36 @@ +{ + "Name": "Arabic-Poem-Emotion", + "Subsets": [], + "HF Link": "Arabic-Poem-Emotion", + "Link": "https://github.com/SakibShahriar95/Arabic-Poem-Emotion", + "License": "unknown", + "Year": 2021, + "Language": "ar", + "Dialect": "mixed", + "Domain": "other", + "Form": "text", + "Collection Style": "crawling", + "Description": "A dataset containing over 9000 Arabic poems labeled by three emotion classes.", + "Volume": "9,000", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "University of Guelph", + "Derived From": "nan",
+ "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "poem emotion Classification", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Abdelrahman Rezk" +} \ No newline at end of file diff --git a/datasets/arabic-stories-corpus.json b/datasets/arabic-stories-corpus.json new file mode 100644 index 0000000..b186206 --- /dev/null +++ b/datasets/arabic-stories-corpus.json @@ -0,0 +1,36 @@ +{ + "Name": "Arabic-Stories-Corpus", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/Arabic_Stories_Corpus", + "Link": "https://github.com/motazsaad/Arabic-Stories-Corpus", + "License": "unknown", + "Year": 2019, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "web pages", + "Form": "text", + "Collection Style": "crawling", + "Description": "Arabic Stories Corpus collected from mawdoo3", + "Volume": "146", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "-", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "story generation, language modeling", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/arabic_-_egyptian_comparable_wikipedia_corpus.json b/datasets/arabic_-_egyptian_comparable_wikipedia_corpus.json new file mode 100644 index 0000000..3d062c5 --- /dev/null +++ b/datasets/arabic_-_egyptian_comparable_wikipedia_corpus.json @@ -0,0 +1,36 @@ +{ + "Name": "Arabic - Egyptian comparable Wikipedia corpus", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/Comparable_Wikipedia", + "Link": "https://www.kaggle.com/datasets/mksaad/arb-egy-cmp-corpus", + "License": "CC BY-SA 4.0", + "Year": 2017, + "Language": "ar", + "Dialect": "mixed", + "Domain": "wikipedia", + "Form": "text", + "Collection Style": "crawling", + "Description": "The dataset is composed of a set of text documents in both Arabic (Modern Standard) and Egyptian dialect aligned at document level. comparable documents share the same document ID.", + "Volume": "nan", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "WikiDocsAligner: An Off-the-Shelf Wikipedia Documents Alignment Tool", + "Paper Link": "https://ieeexplore.ieee.org/document/8038320", + "Script": "Arab", + "Tokenized": "No", + "Host": "kaggle", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "dialect identification, text generation, language modeling", + "Venue Title": "PICICT", + "Citations": "nan", + "Venue Type": "conference", + "Venue Name": "Palestinian International Conference on Information and Communication Technology", + "Authors": "Motaz Saad, Basem O. Alijla", + "Affiliations": "Islamic University of Gaza", + "Abstract": "Wikipedia encyclopedia is an attractive source for comparable corpora in many languages. Most researchers develop their own script to perform document alignment task, which requires efforts and time. In this paper, we present WikiDocsAligner, an off-the-shelf Wikipedia Articles alignment handy tool. 
The implementation of WikiDocsAligner does not require researchers to import/export interlanguage links databases. The user just needs to download Wikipedia dumps (interlanguage links and articles), then provide them to the tool, which performs the alignment. This software can be used easily to align Wikipedia documents in any language pair. Finally, we use WikiDocsAligner to align comparable documents from Arabic Wikipedia and Egyptian Wikipedia. Thus we shed light on Wikipedia as a source of Arabic dialect language resources. The produced resources are interesting and useful, as the demand for Arabic/dialect language resources has increased in the last decade.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/arabic_100k_reviews.json b/datasets/arabic_100k_reviews.json new file mode 100644 index 0000000..f34967a --- /dev/null +++ b/datasets/arabic_100k_reviews.json @@ -0,0 +1,36 @@ +{ + "Name": "Arabic 100k Reviews", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/arabic_100k_reviews", + "Link": "https://www.kaggle.com/datasets/abedkhooli/arabic-100k-reviews", + "License": "unknown", + "Year": 2022, + "Language": "ar", + "Dialect": "mixed", + "Domain": "reviews", + "Form": "text", + "Collection Style": "other", + "Description": "Few Arabic datasets are available for classification comparison and other NLP tasks. This dataset is mainly a compilation of several available datasets and a sampling of 100k rows (99999 to be exact).", + "Volume": "99,999", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "Abed Khooli", + "Derived From": "The hotel and book reviews are a subset of HARD (https://github.com/elnagara/HARD-Arabic-Dataset) and BRAD. The rest were selected from hadyelsahar, with a little over 100 airline reviews collected manually.
", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "kaggle", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "sentiment analysis", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Afrah Altamimi" +} \ No newline at end of file diff --git a/datasets/arabic_ala_lc__romanization.json b/datasets/arabic_ala_lc__romanization.json new file mode 100644 index 0000000..2ee71ff --- /dev/null +++ b/datasets/arabic_ala_lc__romanization.json @@ -0,0 +1,36 @@ +{ + "Name": "Arabic ALA LC Romanization", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/ALA_LC_Romanization", + "Link": "https://github.com/CAMeL-Lab/Arabic_ALA-LC_Romanization", + "License": "unknown", + "Year": 2021, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "crawling", + "Description": "parallel Arabic and Romanized bibliographic entries", + "Volume": "107,439", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "NYU Abu Dhabi", + "Derived From": "nan", + "Paper Title": "Automatic Romanization of Arabic Bibliographic Records\r", + "Paper Link": "https://aclanthology.org/2021.wanlp-1.23.pdf", + "Script": "Arab-Latn", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "text romanization", + "Venue Title": "WANLP", + "Citations": "0.0", + "Venue Type": "workshop", + "Venue Name": "Arabic Natural Language Processing Workshop", + "Authors": "Fadhl Eryani,Nizar Habash", + "Affiliations": ",", + "Abstract": "International library standards require cataloguers to tediously input Romanization of their catalogue records for the benefit of library users without specific language expertise. In this paper, we present the first reported results on the task of automatic Romanization of undiacritized Arabic bibliographic entries. This complex task requires the modeling of Arabic phonology, morphology, and even semantics. We collected a 2.5M word corpus of parallel Arabic and Romanized bibliographic entries, and benchmarked a number of models that vary in terms of complexity and resource dependence. Our best system reaches 89.3% exact word Romanization on a blind test set. We make our data and code publicly available.", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/arabic_analogy.json b/datasets/arabic_analogy.json new file mode 100644 index 0000000..39b35ca --- /dev/null +++ b/datasets/arabic_analogy.json @@ -0,0 +1,36 @@ +{ + "Name": "Arabic analogy ", + "Subsets": [], + "HF Link": "nan", + "Link": " http://computersystemsartists.net/arabic-morphological-analogies.zip", + "License": "unknown", + "Year": 2018, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "manual curation", + "Description": " The corpus consists of pairs of Arabic words that explore different morphological constructs and relationships. The analogy pairs represent semantic relations like masculine-feminine forms, imperfect-perfect verb forms, possessive noun phrases, and verb-object constructions. 
The goal of the corpus is to explore how word vectors in different semantic spaces handle these Arabic linguistic features.", + "Volume": "240", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "Multiple Institutions", + "Derived From": "Common words in Arabic Wikipedia and other corpora", + "Paper Title": "Arabic Word Analogies and Semantics of Simple Phrases", + "Paper Link": "https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8374386&tag=1", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "embeddings", + "Venue Title": "NLP-KE", + "Citations": "nan", + "Venue Type": "conference", + "Venue Name": "IEEE International Conference on Natural Language Processing and Knowledge Engineering", + "Authors": "Stephen Taylor and Tom\u00e1\u0161 Brychc\u00edn", + "Affiliations": "Faculty of Applied Sciences, University of West Bohemia, Czech Republic", + "Abstract": "The paper explores how vector representations of words in semantic spaces handle the morphological and syntactic features of Arabic. It introduces an Arabic Word Analogy Corpus focused on word pairs representing common Arabic morphological constructs, such as verb-object suffixes, definite articles, and noun-adjective agreements. The corpus is used to evaluate the performance of ten different semantic spaces, including word2vec, GloVe, and fastText.", + "Added By": "Maryam Al Emadi" +} \ No newline at end of file diff --git a/datasets/arabic_billion_words.json b/datasets/arabic_billion_words.json new file mode 100644 index 0000000..ae3138c --- /dev/null +++ b/datasets/arabic_billion_words.json @@ -0,0 +1,36 @@ +{ + "Name": "arabic billion words", + "Subsets": [], + "HF Link": "https://hf.co/datasets/abuelkhair-corpus/arabic_billion_words", + "Link": "http://www.abuelkhair.net/index.php/en/arabic/abu-el-khair-corpus", + "License": "unknown", + "Year": 2016, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "crawling", + "Description": "includes more than five million newspaper articles", + "Volume": "5,222,973", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "Multiple Institutions", + "Derived From": "nan", + "Paper Title": "1.5 billion words Arabic Corpus", + "Paper Link": "https://arxiv.org/ftp/arxiv/papers/1611/1611.04033.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "text generation, language modeling", + "Venue Title": "ArXiv", + "Citations": "17.0", + "Venue Type": "preprint", + "Venue Name": "ArXiv", + "Authors": "I. A. El-Khair", + "Affiliations": "nan", + "Abstract": "This study is an attempt to build a contemporary linguistic corpus for the Arabic language. The corpus produced is a text corpus that includes more than five million newspaper articles. It contains over a billion and a half words in total, out of which there are about three million unique words. The data were collected from newspaper articles in ten major news sources from eight Arabic countries, over a period of fourteen years. The corpus was encoded with two types of encoding, namely: UTF-8, and Windows CP-1256.
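Entries that carry an "HF Link", like arabic_billion_words above, can be pulled directly from the Hugging Face hub. A minimal sketch with the datasets library; the repo id is copied from the entry, while configs are discovered at runtime rather than assumed:

```python
from datasets import get_dataset_config_names, load_dataset

# Repo id copied from the entry's "HF Link".
repo = "abuelkhair-corpus/arabic_billion_words"
configs = get_dataset_config_names(repo)
print("available configs:", configs)

# Script-based repos may additionally require trust_remote_code=True.
ds = load_dataset(repo, configs[0], split="train")
print(ds)      # features and row count
print(ds[0])   # first article
```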
It was also marked up with two mark-up languages, namely SGML and XML.", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/arabic_broadcast_news_speech.json b/datasets/arabic_broadcast_news_speech.json new file mode 100644 index 0000000..f727685 --- /dev/null +++ b/datasets/arabic_broadcast_news_speech.json @@ -0,0 +1,36 @@ +{ + "Name": "Arabic Broadcast News Speech", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2006S46", + "License": "LDC User Agreement for Non-Members", + "Year": 2006, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "spoken", + "Collection Style": "other", + "Description": "The recordings were captured from a dedicated satellite receiver and stored as 16-bit PCM, 16-kHz, single-channel, in NIST SPHERE format. The duration of each recording is either 60 minutes or 120 minutes, depending on the VOA broadcast schedule. The date (YYYYMMDD), start-time, and end-time (HHMM EST) for each recording are indicated in the file names. The sample data are not compressed.", + "Volume": "1", + "Unit": "hours", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "700.00 $", + "Test Split": "No", + "Tasks": "speech recognition", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/arabic_broadcast_news_transcripts.json b/datasets/arabic_broadcast_news_transcripts.json new file mode 100644 index 0000000..3191760 --- /dev/null +++ b/datasets/arabic_broadcast_news_transcripts.json @@ -0,0 +1,36 @@ +{ + "Name": "Arabic Broadcast News Transcripts", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2006T20", + "License": "LDC User Agreement for Non-Members", + "Year": 2001, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "transcribed audio", + "Form": "spoken", + "Collection Style": "other", + "Description": "Arabic Broadcast News Transcripts was developed by the Linguistic Data Consortium (LDC) and consists of 10 hours of transcribed speech from Voice of America satellite radio news broadcasts in Arabic recorded by LDC between June 2000 and January 2001.
", + "Volume": "10", + "Unit": "hours", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "400.00 $", + "Test Split": "No", + "Tasks": "speech recognition ", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/arabic_business_corpora.json b/datasets/arabic_business_corpora.json new file mode 100644 index 0000000..06e51a0 --- /dev/null +++ b/datasets/arabic_business_corpora.json @@ -0,0 +1,36 @@ +{ + "Name": "Arabic business corpora", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/buisness_corpora", + "Link": "https://sourceforge.net/projects/arabic-business-copora/", + "License": "unknown", + "Year": 2016, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "web pages", + "Form": "text", + "Collection Style": "crawling", + "Description": "The main corpora contains 1200 articles. The articles have been tagged using Stanford Arabic Part of Speech Tagger. ", + "Volume": "1,200", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "sourceforge", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "part of speech tagging, topic classification ", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/arabic_common_voice.json b/datasets/arabic_common_voice.json new file mode 100644 index 0000000..653e59c --- /dev/null +++ b/datasets/arabic_common_voice.json @@ -0,0 +1,36 @@ +{ + "Name": "Arabic Common Voice", + "Subsets": [], + "HF Link": "https://hf.co/datasets/legacy-datasets/common_voice", + "Link": "https://commonvoice.mozilla.org/ar/datasets", + "License": "CC0\n", + "Year": 2020, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "transcribed audio", + "Form": "spoken", + "Collection Style": "other", + "Description": "an open source, multi-language dataset of voices that anyone can use to train speech-enabled applications.", + "Volume": "85", + "Unit": "hours", + "Ethical Risks": "Low", + "Provider": "mozilla", + "Derived From": "nan", + "Paper Title": "Common Voice: A Massively-Multilingual Speech Corpus\r", + "Paper Link": "https://arxiv.org/pdf/1912.06670.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "Mozilla", + "Access": "Upon-Request", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "speech recognition", + "Venue Title": "LREC", + "Citations": "131.0", + "Venue Type": "conference", + "Venue Name": "International Conference on Language Resources and Evaluation", + "Authors": "Rosana Ardila,Megan Branson,Kelly Davis,Michael Henretty,Michael Kohler,Josh Meyer,Reuben Morais,Lindsay Saunders,Francis M. Tyers,Gregor Weber", + "Affiliations": ",,,,,,,,,", + "Abstract": "The Common Voice corpus is a massively-multilingual collection of transcribed speech intended for speech technology research and development. 
Common Voice is designed for Automatic Speech Recognition purposes but can be useful in other domains (e.g. language identification). To achieve scale and sustainability, the Common Voice project employs crowdsourcing for both data collection and data validation. The most recent release includes 29 languages, and as of November 2019 there are a total of 38 languages collecting data. Over 50,000 individuals have participated so far, resulting in 2,500 hours of collected audio. To our knowledge this is the largest audio corpus in the public domain for speech recognition, both in terms of number of hours and number of languages. As an example use case for Common Voice, we present speech recognition experiments using Mozilla\u2019s DeepSpeech Speech-to-Text toolkit. By applying transfer learning from a source English model, we find an average Character Error Rate improvement of 5.99 \u00b1 5.48 for twelve target languages (German, French, Italian, Turkish, Catalan, Slovenian, Welsh, Irish, Breton, Tatar, Chuvash, and Kabyle). For most of these languages, these are the first ever published results on end-to-end Automatic Speech Recognition.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file
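Since the card above points at the Hugging Face Hub, a rough sketch of how the Arabic split might be pulled with the datasets library. The "ar" config name is an assumption inferred from the card's Mozilla/HF links, and newer releases of the library may require trust_remote_code=True for this legacy loading script (or refuse it entirely), so treat this as illustrative only:

from datasets import load_dataset

# "ar" config assumed from the links in the card above; the legacy loading
# script may need trust_remote_code=True on recent versions of `datasets`.
cv_ar = load_dataset("legacy-datasets/common_voice", "ar",
                     split="test", trust_remote_code=True)
print(cv_ar[0]["sentence"])  # transcript of the first test clip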
diff --git a/datasets/arabic_cts_levantine_fisher_training_data_set_3,_speech.json b/datasets/arabic_cts_levantine_fisher_training_data_set_3,_speech.json new file mode 100644 index 0000000..864df1f --- /dev/null +++ b/datasets/arabic_cts_levantine_fisher_training_data_set_3,_speech.json @@ -0,0 +1,36 @@ +{ + "Name": "Arabic CTS Levantine Fisher Training Data Set 3, Speech", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2005S07", + "License": "LDC User Agreement for Non-Members", + "Year": 2005, + "Language": "ar", + "Dialect": "ar-LEV: (Arabic (Levant))", + "Domain": "transcribed audio", + "Form": "spoken", + "Collection Style": "other", + "Description": "This corpus contains 322 audio files of Arabic telephone conversations in 2-channel SPHERE ulaw format, sampled at 8 kHz.", + "Volume": "50", + "Unit": "hours", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "1,000.00 $", + "Test Split": "No", + "Tasks": "speech recognition", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/arabic_cts_levantine_fisher_training_data_set_3,_transcripts.json b/datasets/arabic_cts_levantine_fisher_training_data_set_3,_transcripts.json new file mode 100644 index 0000000..4b99521 --- /dev/null +++ b/datasets/arabic_cts_levantine_fisher_training_data_set_3,_transcripts.json @@ -0,0 +1,36 @@ +{ + "Name": "Arabic CTS Levantine Fisher Training Data Set 3, Transcripts", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2005T03", + "License": "LDC User Agreement for Non-Members", + "Year": 2005, + "Language": "ar", + "Dialect": "ar-LEV: (Arabic (Levant))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "This corpus contains Arabic transcripts for 322 telephone conversations in UTF-8 format and associated documentation including a word list with frequency of occurrences. The list shows all the occurrences of words in their pronunciation spellings mapped to their corresponding canonical forms, as well as their raw frequency (the number of times they appear in the corpus) and source document frequency (the number of documents in which they appear in the corpus).", + "Volume": "322", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "750.00 $", + "Test Split": "No", + "Tasks": "speech recognition", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/arabic_dev2.json b/datasets/arabic_dev2.json new file mode 100644 index 0000000..db88f7f --- /dev/null +++ b/datasets/arabic_dev2.json @@ -0,0 +1,36 @@ +{ + "Name": "Arabic DEv2", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/ArabicDEv2", + "Link": "https://zenodo.org/record/4560653#.YqSGWXZBxD9", + "License": "CC BY 4.0", + "Year": 2021, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "human translation", + "Description": "The dataset is a translation of a subset (139/467) of the queries in DBpedia Entity v2 (https://github.com/iai-group/DBpedia-Entity) to Modern Standard Arabic. We used the \u201cstopped\u201d version of DBpedia Entity v2 (queries-v2_stopped.txt).", + "Volume": "139", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "Bloomberg L.P.", + "Derived From": "DEv2", + "Paper Title": "SERAG: Semantic Entity Retrieval from Arabic knowledge Graphs", + "Paper Link": "https://aclanthology.org/2021.wanlp-1.24.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "zenodo", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "entity retrieval from knowledge graphs", + "Venue Title": "WANLP", + "Citations": "nan", + "Venue Type": "workshop", + "Venue Name": "Workshop on Arabic Natural Language Processing", + "Authors": "Saher Esmeir", + "Affiliations": "Bloomberg L.P.", + "Abstract": "Knowledge graphs (KGs) are widely used to store and access information about entities and their relationships. Given a query, the task of entity retrieval from a KG aims at presenting a ranked list of entities relevant to the query. Lately, an increasing number of models for entity retrieval have shown a significant improvement over traditional methods. These models, however, were developed for English KGs. In this work, we build on one such system, named KEWER, to propose SERAG (Semantic Entity Retrieval from Arabic knowledge Graphs). Like KEWER, SERAG uses random walks to generate entity embeddings. DBpedia-Entity v2 is considered the standard test collection for entity retrieval. We discuss the challenges of using it for non-English languages in general and Arabic in particular. We provide an Arabic version of this standard collection, and use it to evaluate SERAG. 
SERAG is shown to significantly outperform the popular BM25 model thanks to its multi-hop reasoning.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/arabic_dialects_dataset.json b/datasets/arabic_dialects_dataset.json new file mode 100644 index 0000000..0364d08 --- /dev/null +++ b/datasets/arabic_dialects_dataset.json @@ -0,0 +1,67 @@ +{ + "Name": "Arabic Dialects Dataset", + "Subsets": [ + { + "Name": "GLF", + "Dialect": "ar-GLF: (Arabic (Gulf))", + "Volume": "2,546", + "Unit": "sentences" + }, + { + "Name": "LAV", + "Dialect": "ar-LEV: (Arabic (Levant))", + "Volume": "2,463", + "Unit": "sentences" + }, + { + "Name": "MSA", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Volume": "3,731", + "Unit": "sentences" + }, + { + "Name": "NOR", + "Dialect": "ar-NOR: (Arabic (North Africa))", + "Volume": "3,693", + "Unit": "sentences" + }, + { + "Name": "EGY", + "Dialect": "ar-EG: (Arabic (Egypt))", + "Volume": "4,061", + "Unit": "sentences" + } + ], + "HF Link": "https://hf.co/datasets/arbml/Arabic_Dialects_Dataset", + "Link": "https://www.lancaster.ac.uk/staff/elhaj/corpora.html", + "License": "unknown", + "Year": 2018, + "Language": "ar", + "Dialect": "mixed", + "Domain": "other", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "Dataset of Arabic dialects covering the GULF, EGYPT, LEVANT, and NORTH AFRICA dialect groups in addition to MSA.", + "Volume": "16,494", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "Lancaster University", + "Derived From": "OAC", + "Paper Title": "Arabic Dialect Identification in the Context of Bivalency and Code-Switching", + "Paper Link": "https://aclanthology.org/L18-1573.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "dialect identification", + "Venue Title": "LREC", + "Citations": "17.0", + "Venue Type": "conference", + "Venue Name": "International Conference on Language Resources and Evaluation", + "Authors": "Mahmoud El-Haj,Paul Rayson,Mariam Aboelezz", + "Affiliations": "Lancaster University,Lancaster University,", + "Abstract": "In this paper we use a novel approach towards Arabic dialect identification using language bivalency and written code-switching. Bivalency between languages or dialects is where a word or element is treated by language users as having a fundamentally similar semantic content in more than one language or dialect. Arabic dialect identification in writing is a difficult task even for humans due to the fact that words are used interchangeably between dialects. The task of automatically identifying dialect is harder and classifiers trained using only n-grams will perform poorly when tested on unseen data. Such approaches require significant amounts of annotated training data which is costly and time consuming to produce. Currently available Arabic dialect datasets do not exceed a few hundred thousand sentences, thus we need to extract features other than word and character n-grams. In our work we present experimental results from automatically identifying dialects from the four main Arabic dialect regions (Egypt, North Africa, Gulf and Levant) in addition to Standard Arabic. We extend previous work by incorporating additional grammatical and stylistic features and define a subtractive bivalency profiling approach to address issues of bivalent words across the examined Arabic dialects. The results show that our new method's classification accuracy can reach more than 76% and score well (66%) when tested on completely unseen data.", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/arabic_dictionary_of_inflected_words.json b/datasets/arabic_dictionary_of_inflected_words.json new file mode 100644 index 0000000..0e3110c --- /dev/null +++ b/datasets/arabic_dictionary_of_inflected_words.json @@ -0,0 +1,36 @@ +{ + "Name": "Arabic dictionary of inflected words", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalogue.elra.info/en-us/repository/browse/ELRA-L0098/", + "License": "Non Commercial Use - ELRA END USER", + "Year": 2017, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "The Arabic dictionary of inflected words consists of a list of 6 million inflected forms, fully vowelized, generated in compliance with the grammatical rules of Arabic and tagged with grammatical information which includes POS and grammatical features, including number, gender, case, definiteness, tense, mood and compatibility with clitic agglutination.", + "Volume": "6,000,000", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "-", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "ELRA", + "Access": "With-Fee", + "Cost": "4,500.00\u20ac", + "Test Split": "No", + "Tasks": "lexicon analysis, part of speech tagging", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/arabic_document_classification_dataset.json b/datasets/arabic_document_classification_dataset.json new file mode 100644 index 0000000..2522630 --- /dev/null +++ b/datasets/arabic_document_classification_dataset.json @@ -0,0 +1,36 @@ +{ + "Name": "Arabic document classification dataset", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/Document_Classification", + "Link": "https://diab.edublogs.org/dataset-for-arabic-document-classification/", + "License": "unknown", + "Year": 2014, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "manual curation", + "Description": "The dataset contains nine major disciplines: Art, Literature, Religion, Politics, Law, Economy, Sport, Health, and Technology.", + "Volume": "2,700", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "Multiple Institutions", + "Derived From": "nan", + "Paper Title": "On the Impact of Dataset Characteristics on Arabic Document Classification", + "Paper Link": "https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.800.5666&rep=rep1&type=pdf", + "Script": "Arab", + "Tokenized": "Yes", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "topic classification", + "Venue Title": "IJCA", + "Citations": "11.0", + "Venue Type": "journal", + "Venue Name": "International Journal of Computer Applications", + "Authors": "Diab Abuaiadah,Jihad El Sana,Walid Abusalah", + "Affiliations": ",,", + "Abstract": "This paper describes the impact of dataset characteristics on the results of Arabic document classification algorithms using TF-IDF representations. 
The experiments compared different stemmers, different categories and different training set sizes, and found that different dataset characteristics produced widely differing results, in one case attaining a remarkable 99% recall (accuracy). The use of a standard dataset would eliminate this variability and enable researchers to gain comparable knowledge from the published results.", + "Added By": "Mustafa Ghaleb" +} \ No newline at end of file diff --git a/datasets/arabic_empathetic_dialogues.json b/datasets/arabic_empathetic_dialogues.json new file mode 100644 index 0000000..9188efe --- /dev/null +++ b/datasets/arabic_empathetic_dialogues.json @@ -0,0 +1,36 @@ +{ + "Name": "Arabic Empathetic Dialogues", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/arabic_empathetic_conversations", + "Link": "https://github.com/aub-mind/Arabic-Empathetic-Chatbot", + "License": "unknown", + "Year": 2020, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "manual curation", + "Description": "38K samples of open-domain utterances and empathetic responses in Modern Standard Arabic (MSA)", + "Volume": "36,629", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "AUB", + "Derived From": "nan", + "Paper Title": "Empathy-driven Arabic Conversational Chatbot\r", + "Paper Link": "https://aclanthology.org/2020.wanlp-1.6.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "dialogue generation", + "Venue Title": "WANLP", + "Citations": "3.0", + "Venue Type": "workshop", + "Venue Name": "Arabic Natural Language Processing Workshop", + "Authors": "Tarek Naous,Christian Hokayem,Hazem M. Hajj", + "Affiliations": ",,", + "Abstract": "Conversational models have witnessed a significant research interest in the last few years with the advancements in sequence generation models. A challenging aspect in developing human-like conversational models is enabling the sense of empathy in bots, making them infer emotions from the person they are interacting with. By learning to develop empathy, chatbot models are able to provide human-like, empathetic responses, thus making the human-machine interaction close to human-human interaction. Recent advances in English use complex encoder-decoder language models that require large amounts of empathetic conversational data. However, research has not produced empathetic bots for Arabic. Furthermore, there is a lack of Arabic conversational data labeled with empathy. To address these challenges, we create an Arabic conversational dataset that comprises empathetic responses. However, the dataset is not large enough to develop very complex encoder-decoder models. To address the limitation of data scale, we propose a special encoder-decoder composed of a Long Short-Term Memory (LSTM) Sequence-to-Sequence (Seq2Seq) with Attention. 
The experiments showed success of our proposed empathy-driven Arabic chatbot in generating empathetic responses with a perplexity of 38.6, an empathy score of 3.7, and a fluency score of 3.92.", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/arabic_english_parallel_news_part_1.json b/datasets/arabic_english_parallel_news_part_1.json new file mode 100644 index 0000000..455cbdc --- /dev/null +++ b/datasets/arabic_english_parallel_news_part_1.json @@ -0,0 +1,36 @@ +{ + "Name": "Arabic English Parallel News Part 1", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2004T18", + "License": "LDC User Agreement for Non-Members", + "Year": 2004, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "other", + "Description": "LDC collected the data in this corpus via Ummah Press Service from January 2001 to September 2004. It totals 8,439 story pairs, 68,685 sentence pairs. The corpus is aligned at sentence level. All data files are SGML documents.", + "Volume": "2,000,000", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "3,000.00 $", + "Test Split": "No", + "Tasks": "machine translation", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/arabic_flood_twitter_dataset.json b/datasets/arabic_flood_twitter_dataset.json new file mode 100644 index 0000000..061bc6d --- /dev/null +++ b/datasets/arabic_flood_twitter_dataset.json @@ -0,0 +1,36 @@ +{ + "Name": "Arabic Flood Twitter Dataset", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/twitter_flood_detection", + "Link": "https://github.com/alaa-a-a/Arabic-Twitter-Corpus-for-Flood-Detection", + "License": "unknown", + "Year": 2019, + "Language": "ar", + "Dialect": "mixed", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "It includes 4,037 human-labelled Arabic Twitter messages for four high-risk flood events that occurred in 2018", + "Volume": "4,037", + "Unit": "sentences", + "Ethical Risks": "Medium", + "Provider": "Multiple Institutions", + "Derived From": "nan", + "Paper Title": "Crisis Detection from Arabic Tweets\r", + "Paper Link": "https://aclanthology.org/W19-5609.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "crisis detection", + "Venue Title": "WACL", + "Citations": "5.0", + "Venue Type": "workshop", + "Venue Name": "Workshop on Arabic Corpus Linguistics", + "Authors": "Alaa Alharbi,Mark G. Lee", + "Affiliations": "Taibah University;University of Birmingham,", + "Abstract": "Social media (SM) platforms such as Twitter offer a rich source of real-time information about crises from which useful information can be extracted to support situational\r\nawareness. The task of automatically identifying SM messages related to a specific event\r\nposes many challenges, including processing\r\nlarge volumes of short, noisy data in real time.\r\nThis paper explored the problem of extracting crisis-related messages from Arabic Twitter data. 
We focused on high-risk floods as they are one of the main hazards in the Middle East. In this work, we presented a gold-standard Arabic Twitter corpus for four high-risk floods that occurred in 2018. Using the annotated dataset, we investigated the performance of different classical machine learning (ML) and deep neural network (DNN) classifiers. The results showed that deep learning is promising in identifying flood-related posts.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/arabic_gigaword.json b/datasets/arabic_gigaword.json new file mode 100644 index 0000000..04d061f --- /dev/null +++ b/datasets/arabic_gigaword.json @@ -0,0 +1,36 @@ +{ + "Name": "Arabic Gigaword", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2003T12", + "License": "LDC User Agreement for Non-Members", + "Year": 2003, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "other", + "Description": "There are 319 files, totalling approximately 1.1GB in compressed form (4348 MB uncompressed, and 391619 Kwords).", + "Volume": "1,256,719", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "3,000.00 $", + "Test Split": "No", + "Tasks": "information retrieval,language modeling", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/arabic_gigaword_fifth_edition.json b/datasets/arabic_gigaword_fifth_edition.json new file mode 100644 index 0000000..292aea3 --- /dev/null +++ b/datasets/arabic_gigaword_fifth_edition.json @@ -0,0 +1,36 @@ +{ + "Name": "Arabic Gigaword Fifth Edition", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2011T11", + "License": "LDC User Agreement for Non-Members", + "Year": 2011, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "other", + "Description": "All text data are presented in SGML form, using a very simple, minimal markup structure. For every opening tag (DOC, HEADLINE, DATELINE, TEXT, P), there is a corresponding closing tag -- always. The attribute values in the DOC tag are always presented within double-quotes; the id= attribute of DOC consists of the 7-letter source abbreviation (in CAPS), an underscore character, an 8-digit date string representing the date of the story (YYYYMMDD), a period, and a 4-digit sequence number starting at 0001 for each date (e.g. XIN_ARB_200101.0001); in this way, every DOC in the corpus is uniquely identifiable by the id string.", + "Volume": "nan", + "Unit": "nan", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "6,000.00 $", + "Test Split": "No", + "Tasks": "language modeling,information retrieval", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file
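The description above fully specifies the Gigaword DOC id convention, so as an illustration only, a small Python sketch that transcribes that convention into a regex. Note that the example id in the description appears truncated to a 6-digit date; the sketch assumes the stated 8-digit YYYYMMDD form, and the pattern is not part of any LDC tooling:

import re

# Per the card above: 7-letter source abbreviation (e.g. XIN_ARB), an underscore,
# an 8-digit YYYYMMDD date, a period, and a 4-digit per-date sequence number.
DOC_ID = re.compile(r"^(?P<source>[A-Z]{3}_[A-Z]{3})_(?P<date>\d{8})\.(?P<seq>\d{4})$")

m = DOC_ID.match("XIN_ARB_20010101.0001")  # hypothetical full-form id
if m:
    print(m.group("source"), m.group("date"), m.group("seq"))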
diff --git a/datasets/arabic_gigaword_fourth_edition.json b/datasets/arabic_gigaword_fourth_edition.json new file mode 100644 index 0000000..12a41e6 --- /dev/null +++ b/datasets/arabic_gigaword_fourth_edition.json @@ -0,0 +1,36 @@ +{ + "Name": "Arabic Gigaword Fourth Edition", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2009T30", + "License": "LDC User Agreement for Non-Members", + "Year": 2009, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "other", + "Description": "The LDC catalog entry shows data quantity by source under the following categories: data source (Source); the number of files per source (#Files); compressed file size (Gzip-MB); uncompressed file size (Totl-MB); the number of space-separated word tokens in the text (K-words); and the number of documents per source (#DOCs).", + "Volume": "2,716,995", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "5,000.00 $", + "Test Split": "No", + "Tasks": "language modeling,information retrieval", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/arabic_gigaword_second_edition.json b/datasets/arabic_gigaword_second_edition.json new file mode 100644 index 0000000..cbf9423 --- /dev/null +++ b/datasets/arabic_gigaword_second_edition.json @@ -0,0 +1,36 @@ +{ + "Name": "Arabic Gigaword Second Edition", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2006T02", + "License": "LDC User Agreement for Non-Members", + "Year": 2006, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "other", + "Description": "The LDC catalog entry contains information for this corpus, broken down by source. The information includes source codes represented in the corpus as well as their codes from the first edition, the collection span and number of documents new to this edition, the number of documents total, and the K-words (thousands of words) for each source. 
Ummah Press is a new source included in the second edition and therefore has no first edition info.", + "Volume": "1,591,983", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "3,000.00 $", + "Test Split": "No", + "Tasks": "information retrieval,language modeling", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/arabic_gigaword_third_edition.json b/datasets/arabic_gigaword_third_edition.json new file mode 100644 index 0000000..4850617 --- /dev/null +++ b/datasets/arabic_gigaword_third_edition.json @@ -0,0 +1,36 @@ +{ + "Name": "Arabic Gigaword Third Edition", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2007T40", + "License": "LDC User Agreement for Non-Members", + "Year": 2007, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "other", + "Description": "This release contains 547 files, totalling approximately 1.8GB in compressed form (6,673 MB uncompressed) and 1,994,735 K-words.", + "Volume": "1,994,735", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "4,000.00 $", + "Test Split": "No", + "Tasks": "language modeling,information retrieval", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/arabic_hate_speech_2022_shared_task.json b/datasets/arabic_hate_speech_2022_shared_task.json new file mode 100644 index 0000000..98624a8 --- /dev/null +++ b/datasets/arabic_hate_speech_2022_shared_task.json @@ -0,0 +1,36 @@ +{ + "Name": "Arabic Hate Speech 2022 Shared Task", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/Arabic_Hate_Speech", + "Link": "https://sites.google.com/view/arabichate2022/home", + "License": "custom", + "Year": 2022, + "Language": "ar", + "Dialect": "mixed", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "The largest Arabic dataset for offensive, fine-grained hate speech, vulgar and violence content.", + "Volume": "12,698", + "Unit": "sentences", + "Ethical Risks": "High", + "Provider": "QCRI", + "Derived From": "nan", + "Paper Title": "Emojis as Anchors to Detect Arabic Offensive Language and Hate Speech", + "Paper Link": "https://arxiv.org/pdf/2201.06723.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "QCRI Resources", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "offensive language detection, hate speech detection", + "Venue Title": "arXiv", + "Citations": "nan", + "Venue Type": "preprint", + "Venue Name": "nan", + "Authors": "Hamdy Mubarak, Sabit Hassan, and Shammur Absar Chowdhury", + "Affiliations": "nan", + "Abstract": "We introduce a generic, language-independent method to collect a large percentage of offensive and hate tweets regardless of their topics or genres. We harness the extralinguistic information embedded in the emojis to collect a large number of offensive tweets. We apply the proposed method on Arabic tweets and compare it with English tweets \u2013 analysing key cultural differences. We observed a constant usage of these emojis to represent offensiveness throughout different timespans on Twitter. We manually annotate and publicly release the largest Arabic dataset for offensive, fine-grained hate speech, vulgar and violence content. Furthermore, we benchmark the dataset for detecting offensiveness and hate speech using different transformer architectures and perform in-depth linguistic analysis. We evaluate our models on external datasets \u2013 a Twitter dataset collected using a completely different method, and a multi-platform dataset containing comments from Twitter, YouTube and Facebook, for assessing generalization capability. Competitive results on these datasets suggest that the data collected using our method captures universal characteristics of offensive language. Our findings also highlight the common words used in offensive communications, common targets for hate speech, specific patterns in violence tweets; and pinpoint common classification errors that can be attributed to limitations of NLP models. We observe that even state-of-the-art transformer models may fail to take into account culture, background and context or understand nuances present in real-world data such as sarcasm.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/arabic_infectious_disease_ontology.json b/datasets/arabic_infectious_disease_ontology.json new file mode 100644 index 0000000..5c54d98 --- /dev/null +++ b/datasets/arabic_infectious_disease_ontology.json @@ -0,0 +1,36 @@ +{ + "Name": "Arabic Infectious Disease Ontology", + "Subsets": [], + "HF Link": "nan", + "Link": "http://www.research.lancs.ac.uk/portal/en/datasets/arabic-infectious-disease-ontology(39dbef60-ae9b-4405-99c8-35a41e95a3e0).html", + "License": "unknown", + "Year": 2020, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "web pages", + "Form": "text", + "Collection Style": "crawling", + "Description": "This file contains an Arabic Infectious Disease Ontology to include Non-Standard Terminology. The Arabic Infectious Disease Ontology is written in the Arabic language, and is the first ontology in Arabic specialising in the infectious disease domain. It contains 11 classes, 21 object properties, 11 datatype properties, and 215 individual concepts.", + "Volume": "215", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "Lancaster University", + "Derived From": "nan", + "Paper Title": "Developing an Arabic Infectious Disease Ontology to Include Non-Standard Terminology", + "Paper Link": "https://eprints.lancs.ac.uk/id/eprint/142307/1/LREC_2020_Paper_Developing_an_Arabic_Infectious_Ontology_.pdf", + "Script": "Arab", + "Tokenized": "Yes", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "infectious disease ontology", + "Venue Title": "LREC", + "Citations": "nan", + "Venue Type": "conference", + "Venue Name": "Language Resources and Evaluation Conference", + "Authors": "Lama Alsudias, Paul Rayson", + "Affiliations": "Lancaster University; King Saud University", + "Abstract": "Building ontologies is a crucial part of the semantic web endeavour. 
In recent years, research interest has grown rapidly in supporting\nlanguages such as Arabic in NLP in general but there has been very little research on medical ontologies for Arabic. We present a new\nArabic ontology in the infectious disease domain to support various important applications including the monitoring of infectious disease\nspread via social media. This ontology meaningfully integrates the scientific vocabularies of infectious diseases with their informal\nequivalents. We use ontology learning strategies with manual checking to build the ontology. We applied three statistical methods for\nterm extraction from selected Arabic infectious diseases articles: TF-IDF, C-value, and YAKE. We also conducted a study, by consulting\naround 100 individuals, to discover the informal terms related to infectious diseases in Arabic. In future work, we will automatically\nextract the relations for infectious disease concepts but for now these are manually created. We report two complementary experiments\nto evaluate the ontology. First, a quantitative evaluation of the term extraction results and an additional qualitative evaluation by a\ndomain expert.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/arabic_keyphrase_dataset.json b/datasets/arabic_keyphrase_dataset.json new file mode 100644 index 0000000..f48d328 --- /dev/null +++ b/datasets/arabic_keyphrase_dataset.json @@ -0,0 +1,36 @@ +{ + "Name": "Arabic Keyphrase dataset", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/Keyphrase_Extraction", + "Link": "https://github.com/logmani/ArabicDataset", + "License": "unknown", + "Year": 2017, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "A dataset in Arabic language for automatic keyphrase extraction algorithms", + "Volume": "400", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "KFUPM", + "Derived From": "nan", + "Paper Title": "ARABIC DATASET FOR AUTOMATIC KEYPHRASE EXTRACTION", + "Paper Link": "https://airccj.org/CSCP/vol7/csit76321.pdf", + "Script": "Arab-Latn", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "keyphrase extraction", + "Venue Title": "CSIT", + "Citations": "0.0", + "Venue Type": "conference", + "Venue Name": "International Conference on Computer Science and Information Technologies", + "Authors": "Mohammed Al Logmani,H. Muhtaseb", + "Affiliations": ",", + "Abstract": "We propose a dataset in Arabic language for automatic keyphrase extraction algorithms. Our Arabic dataset contains 400 documents along with their keyphrases. The dataset covers eighteen different categories. 
An evaluation using a state-of-the-art algorithm demonstrates that the accuracy of our dataset is similar to that of English datasets.", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/arabic_morphological_dictionary.json b/datasets/arabic_morphological_dictionary.json new file mode 100644 index 0000000..820b625 --- /dev/null +++ b/datasets/arabic_morphological_dictionary.json @@ -0,0 +1,36 @@ +{ + "Name": "Arabic Morphological Dictionary", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalogue.elra.info/en-us/repository/browse/ELRA-L0088/", + "License": "Non Commercial Use - ELRA END USER", + "Year": 2012, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "The Arabic Morphological Dictionary contains 4,912,749 entries, including: - 3,374,852 nouns, - 1,537,699 verbs, - 198 grammatical words.", + "Volume": "4,912,749", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "-", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "ELRA", + "Access": "With-Fee", + "Cost": "450.00\u20ac", + "Test Split": "No", + "Tasks": "morphological analysis", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/arabic_named_entities.json b/datasets/arabic_named_entities.json new file mode 100644 index 0000000..7732edd --- /dev/null +++ b/datasets/arabic_named_entities.json @@ -0,0 +1,36 @@ +{ + "Name": "Arabic Named Entities", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/Arabic_Named_Entities", + "Link": "https://sourceforge.net/projects/arabicnes/", + "License": "unknown", + "Year": 2010, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "We have extracted approximately 45,000 Arabic NEs.", + "Volume": "45,000", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "Multiple Institutions", + "Derived From": "nan", + "Paper Title": "An automatically built Named Entity lexicon for Arabic", + "Paper Link": "http://doras.dcu.ie/15979/1/An_automatically_built_Named_Entity_lexicon_for_Arabic.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "sourceforge", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "named entity recognition", + "Venue Title": "LREC", + "Citations": "41.0", + "Venue Type": "conference", + "Venue Name": "International Conference on Language Resources and Evaluation", + "Authors": "Mohammed Attia,Antonio Toral,L. Tounsi,M. Monachini,Josef van Genabith", + "Affiliations": ",University of Groningen,,,", + "Abstract": "We have successfully adapted and extended the automatic Multilingual, Interoperable Named Entity Lexicon approach to Arabic, using Arabic WordNet (AWN) and Arabic Wikipedia (AWK). First, we extract AWN\u2019s instantiable nouns and identify the corresponding categories and hyponym subcategories in AWK. Then, we exploit Wikipedia inter-lingual links to locate correspondences between articles in ten different languages in order to identify Named Entities (NEs). 
We apply keyword search on AWK abstracts to provide for Arabic articles that do not have a correspondence in any of the other languages. In addition, we perform a post-processing step to fetch further NEs from AWK not reachable through AWN. Finally, we investigate diacritization using matching with geonames databases, MADA-TOKAN tools and different heuristics for restoring vowel marks of Arabic NEs. Using this methodology, we have extracted approximately 45,000 Arabic NEs and built, to the best of our knowledge, the largest, most mature and well-structured Arabic NE lexical resource to date. We have stored and organised this lexicon following the Lexical Markup Framework (LMF) ISO standard. We conduct a quantitative and qualitative evaluation of the lexicon against a manually annotated gold standard and achieve precision scores from \n 95.83% (with 66.13% recall) to 99.31% (with 61.45% recall) according to different values of a threshold.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/arabic_named_entity_gazetteer.json b/datasets/arabic_named_entity_gazetteer.json new file mode 100644 index 0000000..e98af35 --- /dev/null +++ b/datasets/arabic_named_entity_gazetteer.json @@ -0,0 +1,36 @@ +{ + "Name": "Arabic Named Entity Gazetteer", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/WikiFANEGazet", + "Link": "https://sourceforge.net/projects/arabic-named-entity-gazetteer/", + "License": "CC BY 3.0", + "Year": 2013, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "wikipedia", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "A gazetteer of entities curated from Wikipedia.", + "Volume": "68,355", + "Unit": "tokens", + "Ethical Risks": "nan", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "Automatically Developing a Fine-grained Arabic Named Entity Corpus and Gazetteer by utilizing Wikipedia", + "Paper Link": "https://aclanthology.org/I13-1045/", + "Script": "Arab", + "Tokenized": "No", + "Host": "sourceforge", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "named entity recognition, information retrieval", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Amr Keleg" +} \ No newline at end of file diff --git a/datasets/arabic_natural_audio_dataset.json b/datasets/arabic_natural_audio_dataset.json new file mode 100644 index 0000000..efd8e1a --- /dev/null +++ b/datasets/arabic_natural_audio_dataset.json @@ -0,0 +1,36 @@ +{ + "Name": "Arabic Natural Audio Dataset", + "Subsets": [], + "HF Link": "nan", + "Link": "https://data.mendeley.com/datasets/xm232yxf7t/1", + "License": "CC BY 4.0", + "Year": 2018, + "Language": "ar", + "Dialect": "mixed", + "Domain": "transcribed audio", + "Form": "spoken", + "Collection Style": "other", + "Description": "This is the first Arabic Natural Audio Dataset (ANAD) developed to recognize 3 discrete emotions: Happy,angry, and surprised.", + "Volume": "1,384", + "Unit": "hours", + "Ethical Risks": "Medium", + "Provider": "Lebanese International University", + "Derived From": "nan", + "Paper Title": "Emotion recognition in Arabic speech", + "Paper Link": "https://link.springer.com/content/pdf/10.1007/s10470-018-1142-4.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "speech emotion recognition", 
+ "Venue Title": "nan", + "Citations": "30.0", + "Venue Type": "journal", + "Venue Name": "Analog Integrated Circuits and Signal Processing", + "Authors": "Samira Klaylat, Ziad Osman, Lama Hamandi, Rached Zantout", + "Affiliations": "nan", + "Abstract": "Automatic emotion recognition from speech signals without linguistic cues has been an important emerging research area. Integrating emotions in human\u2013computer interaction is of great importance to effectively simulate real life scenarios. Research has been focusing on recognizing emotions from acted speech while little work was done on natural real life\nutterances. English, French, German and Chinese corpora were used for that purpose while no natural Arabic corpus was found to date. In this paper, emotion recognition in Arabic spoken data is studied for the first time. A realistic speech corpus from Arabic TV shows is collected. The videos are labeled by their perceived emotions; namely happy, angry or\nsurprised. Prosodic features are extracted and thirty-five classification methods are applied. Results are analyzed in this paper and conclusions and future recommendations are identified.", + "Added By": "Abdelrahman Rezk" +} \ No newline at end of file diff --git a/datasets/arabic_news_articles.json b/datasets/arabic_news_articles.json new file mode 100644 index 0000000..a079bb7 --- /dev/null +++ b/datasets/arabic_news_articles.json @@ -0,0 +1,36 @@ +{ + "Name": "Arabic news articles", + "Subsets": [], + "HF Link": "nan", + "Link": "https://webz.io/free-datasets/arabic-news-articles/", + "License": "unknown", + "Year": 2016, + "Language": "ar", + "Dialect": "mixed", + "Domain": "news articles", + "Form": "text", + "Collection Style": "crawling", + "Description": "Arabic news articles dataset crawled from the Webz.io API Language category", + "Volume": "236,383", + "Unit": "documents", + "Ethical Risks": "High", + "Provider": "Webz.io API", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab-Latn", + "Tokenized": "No", + "Host": "other", + "Access": "Upon-Request", + "Cost": "nan", + "Test Split": "No", + "Tasks": "financial Analysis", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "Webz.io API", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Jezia Zakraoui" +} \ No newline at end of file diff --git a/datasets/arabic_news_articles_from_aljazeera_net.json b/datasets/arabic_news_articles_from_aljazeera_net.json new file mode 100644 index 0000000..3c58c7a --- /dev/null +++ b/datasets/arabic_news_articles_from_aljazeera_net.json @@ -0,0 +1,36 @@ +{ + "Name": "Arabic News articles from Aljazeera.net", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/news_articles_aljazeera", + "Link": "https://www.kaggle.com/datasets/arhouati/arabic-news-articles-from-aljazeeranet", + "License": "unknown", + "Year": 2020, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "crawling", + "Description": "Natural Language Process, or NLP, is one of the most studied machine learning field. Much progress has been made in recent years which has allowed this field to move into large scale use in several domains. Nowadays, NLP is widely used in social networks, in search engines, in translation tools, in chatbot assistants, and many others cases \u2026 However, the progress and results differ from language to another. 
The majority of machine learning models prioritize English and neglect the other languages, in particular Arabic. The main reason for this is the lack of datasets; hence the interest of the current dataset, which gathers more than 5,000 news articles in Arabic.", + "Volume": "5,870", + "Unit": "documents", + "Ethical Risks": "nan", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "kaggle", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "summarization, embedding, classification model to identify article domain, sentiment analysis, co-reference identification model, named entity recognition", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "ABDELKADER RHOUATI", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Kamel GAANOUN" +} \ No newline at end of file diff --git a/datasets/arabic_news_dataset_about_hajj.json b/datasets/arabic_news_dataset_about_hajj.json new file mode 100644 index 0000000..9977e0d --- /dev/null +++ b/datasets/arabic_news_dataset_about_hajj.json @@ -0,0 +1,36 @@ +{ + "Name": "Arabic news dataset about Hajj", + "Subsets": [], + "HF Link": "https://hf.co/datasets/Haneen84/Arabic_news/tree/main", + "Link": "https://hf.co/datasets/Haneen84/Arabic_news/tree/main", + "License": "unknown", + "Year": 2022, + "Language": "ar", + "Dialect": "ar-CLS: (Arabic (Classic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "More than 2,000 articles about Hajj", + "Volume": "2,000", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "Arabic Pilgrim Services Dataset: Creating and Analysis", + "Paper Link": "https://ieeexplore.ieee.org/document/10085561", + "Script": "Arab", + "Tokenized": "No", + "Host": "HuggingFace", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "sentiment analysis, text generation, information retrieval", + "Venue Title": "2023 1st International Conference on Advanced Innovations in Smart Cities", + "Citations": "nan", + "Venue Type": "conference", + "Venue Name": "2023 1st International Conference on Advanced Innovations in Smart Cities", + "Authors": "Al-Barhamtoshy, Hassanin & Himdi, Hanen & Alyahya, Mohamad
", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Hanen Himdi" +} \ No newline at end of file diff --git a/datasets/arabic_news_translation_text_part_1.json b/datasets/arabic_news_translation_text_part_1.json new file mode 100644 index 0000000..1f8286a --- /dev/null +++ b/datasets/arabic_news_translation_text_part_1.json @@ -0,0 +1,36 @@ +{ + "Name": "Arabic News Translation Text Part 1", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2004T17", + "License": "LDC User Agreement for Non-Members", + "Year": 2004, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "other", + "Description": "Here is a breakdown of the Arabic material by source:", + "Volume": "1,526", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "3,000.00 $", + "Test Split": "No", + "Tasks": "cross-lingual information retrieval,language teaching,machine translation", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/arabic_news_tweets.json b/datasets/arabic_news_tweets.json new file mode 100644 index 0000000..1d78744 --- /dev/null +++ b/datasets/arabic_news_tweets.json @@ -0,0 +1,36 @@ +{ + "Name": "Arabic News Tweets", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/Arabic_News_Tweets", + "Link": "https://data.mendeley.com/datasets/9dxgbgx86k/3", + "License": "CC BY 4.0", + "Year": 2021, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "This dataset is a relatively great size collection of Arabic news tweets that were collected from an official and verified users in Twitter. All news that is collected from the most popular and official users in Saudi Arabia belongs to Saudi Arabia news. All data that is gathered was retrieved using specific time period and collected all news in that time. To the best of our knowledge, this dataset is the first Arabic news data collection that does not specify by keywords and belongs to Saudi Arabia. This news dataset can be valuable for diverse tasks in NLP, such as text classification and automated verification system. The dataset has been categorized into 5 different news classes which are general news, regions news, sport news, economic news, and quality life news. 
In this data article, 89,179 original tweets have presented and fully labeled into related categories.", + "Volume": "89,179", + "Unit": "sentences", + "Ethical Risks": "nan", + "Provider": "Umm Al-Qura University, University of Technology Sydney", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "Mendeley Data", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "topic classification", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "Karali Sami, Thanoon Mohammed, Lin Chin-Teng", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Kamel GAANOUN" +} \ No newline at end of file diff --git a/datasets/arabic_newswire_english_translation_collection.json b/datasets/arabic_newswire_english_translation_collection.json new file mode 100644 index 0000000..efd1186 --- /dev/null +++ b/datasets/arabic_newswire_english_translation_collection.json @@ -0,0 +1,36 @@ +{ + "Name": "Arabic Newswire English Translation Collection", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2009T22", + "License": "LDC User Agreement for Non-Members", + "Year": 2009, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "other", + "Description": "the 49 translations for this AFP subset are not included in this release, resulting in a total 1,682 translations for the 1,731 source stories.", + "Volume": "1,731", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "1,500.00 $", + "Test Split": "No", + "Tasks": "syntactic parsing", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/arabic_newswire_part_1.json b/datasets/arabic_newswire_part_1.json new file mode 100644 index 0000000..2fbd211 --- /dev/null +++ b/datasets/arabic_newswire_part_1.json @@ -0,0 +1,36 @@ +{ + "Name": "Arabic Newswire Part 1", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2001T55", + "License": "LDC User Agreement for Non-Members", + "Year": 2002, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "other", + "Description": "The data is in 2,337 compressed (zipped) Arabic text data files. There are 209 Mb of compressed data (869 Mb uncompressed) with approximately 383,872 documents containing 76 million tokens over approximately 666,094 unique words. 
", + "Volume": "383,872", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "1,200.00 $", + "Test Split": "No", + "Tasks": "information retrieval,language modeling", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/arabic_ontology.json b/datasets/arabic_ontology.json new file mode 100644 index 0000000..a507e66 --- /dev/null +++ b/datasets/arabic_ontology.json @@ -0,0 +1,36 @@ +{ + "Name": "Arabic ontology ", + "Subsets": [], + "HF Link": "nan", + "Link": "https://ontology.birzeit.edu/about#download", + "License": "CC BY 4.0", + "Year": 2021, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "manual curation", + "Description": "An Arabic Wordnet with ontologically-clean content. Classification of the meanings of the Arabic terms, based on state-of-art science, rather than on speakers' na\u00efve knowledge. ", + "Volume": "30,000", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "SinaLab, Birzeit University", + "Derived From": "nan", + "Paper Title": " The Arabic Ontology - An Arabic Wordnet with Ontologically Clean Content.", + "Paper Link": "https://arxiv.org/abs/2205.09664", + "Script": "Arab", + "Tokenized": "No", + "Host": "SinaLab Resources", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "language modeling, topic classification, cross-lingual, information retrieval, speaker identification, transliteration", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "Mustafa Jarrar", + "Affiliations": "nan", + "Abstract": "We present a formal Arabic wordnet built on the basis of a carefully designed ontology hereby referred to as the Arabic Ontology. The ontology provides a formal representation of the concepts that the Arabic terms convey, and its content was built with ontological analysis in mind, and benchmarked to scientific advances and rigorous knowledge sources as much as this is possible, rather than to only speakers\u2019 beliefs as lexicons typically are. A comprehensive evaluation was conducted thereby demonstrating that the current version of the top-levels of the ontology can top the majority of the Arabic meanings. The ontology consists currently of about 1,300 well-investigated concepts in addition to 11,000 concepts that are partially validated. The ontology is accessible and searchable through a lexicographic search engine (http://ontology.birzeit.edu) that also includes about 150 Arabic-multilingual lexicons, and which are being mapped and enriched using the ontology. The ontology is fully mapped with Princeton WordNet, Wikidata, and other resources. 
", + "Added By": "Tymaa Hammouda" +} \ No newline at end of file diff --git a/datasets/arabic_osact4___offensive_language_detection.json b/datasets/arabic_osact4___offensive_language_detection.json new file mode 100644 index 0000000..9549ff2 --- /dev/null +++ b/datasets/arabic_osact4___offensive_language_detection.json @@ -0,0 +1,36 @@ +{ + "Name": "Arabic OSACT4 : Offensive Language Detection", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/OSACT4_hatespeech", + "Link": "https://github.com/motazsaad/arabic-hatespeech-data/blob/master/OSACT4/README.md", + "License": "unknown", + "Year": 2020, + "Language": "ar", + "Dialect": "mixed", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "OSACT4 Shared Task on Offensive Language Detection", + "Volume": "8,000", + "Unit": "sentences", + "Ethical Risks": "High", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "Overview of OSACT4 Arabic Offensive Language Detection Shared Task\r", + "Paper Link": "https://aclanthology.org/2020.osact-1.7.pdf", + "Script": "Arab-Latn", + "Tokenized": "No", + "Host": "CodaLab", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "offensive language detection", + "Venue Title": "OSACT", + "Citations": "24.0", + "Venue Type": "workshop", + "Venue Name": "Workshop on Open-Source Arabic Corpora and Processing Tools", + "Authors": "Hamdy Mubarak,Kareem Darwish,Walid Magdy,Tamer Elsayed,H. Al-Khalifa", + "Affiliations": ",,The University of Edinburgh,,", + "Abstract": "This paper provides an overview of the offensive language detection shared task at the 4th workshop on Open-Source Arabic Corpora and Processing Tools (OSACT4). There were two subtasks, namely: Subtask A, involving the detection of offensive language, which contains unacceptable or vulgar content in addition to any kind of explicit or implicit insults or attacks against individuals or groups; and Subtask B, involving the detection of hate speech, which contains insults or threats targeting a group based on their nationality, ethnicity, race, gender, political or sport affiliation, religious belief, or other common characteristics. In total, 40 teams signed up to participate in Subtask A, and 14 of them submitted test runs. For Subtask B, 33 teams signed up to participate and 13 of them submitted runs. 
We present and analyze all submissions in this paper.", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/arabic_osact5___arabic_hate_speech.json b/datasets/arabic_osact5___arabic_hate_speech.json new file mode 100644 index 0000000..0525ebe --- /dev/null +++ b/datasets/arabic_osact5___arabic_hate_speech.json @@ -0,0 +1,36 @@ +{ + "Name": "Arabic OSACT5: Arabic Hate Speech", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/osact5_hatespeech", + "Link": "https://codalab.lisn.upsaclay.fr/competitions/2324", + "License": "custom", + "Year": 2022, + "Language": "ar", + "Dialect": "mixed", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "Fine-Grained Hate Speech Detection on Arabic Twitter", + "Volume": "10,157", + "Unit": "sentences", + "Ethical Risks": "High", + "Provider": "QCRI", + "Derived From": "nan", + "Paper Title": "Emojis as Anchors to Detect Arabic Offensive Language and Hate Speech", + "Paper Link": "https://arxiv.org/pdf/2201.06723.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "CodaLab", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "hate speech detection", + "Venue Title": "OSACT", + "Citations": "nan", + "Venue Type": "workshop", + "Venue Name": "Workshop on Open-Source Arabic Corpora and Processing Tools", + "Authors": "Hamdy Mubarak, Sabit Hassan, and Shammur Absar Chowdhury", + "Affiliations": "Qatar Computing Research Institute", + "Abstract": "We introduce a generic, language-independent method to collect a large percentage of offensive and hate tweets regardless of their topics or genres. We harness the extralinguistic information embedded in the emojis to collect a large number of offensive tweets. We apply the proposed method on Arabic tweets and compare it with English tweets -- analyzing some cultural differences. We observed a constant usage of these emojis to represent offensiveness throughout different timelines in Twitter. We manually annotate and publicly release the largest Arabic dataset for offensive, fine-grained hate speech, vulgar and violence content. Furthermore, we benchmark the dataset for detecting offense and hate speech using different transformer architectures and performed in-depth linguistic analysis. We evaluate our models on external datasets -- a Twitter dataset collected using a completely different method, and a multi-platform dataset containing comments from Twitter, YouTube and Facebook, for assessing generalization capability. Competitive results on these datasets suggest that the data collected using our method captures universal characteristics of offensive language. 
Our findings also highlight the common words used in offensive communications; common targets for hate speech; specific patterns in violence tweets and pinpoint common classification errors due to the need to understand the context, consider culture and background and the presence of sarcasm among others.", + "Added By": "Abdelrahman Kaseb" +} \ No newline at end of file diff --git a/datasets/arabic_oscar.json b/datasets/arabic_oscar.json new file mode 100644 index 0000000..b7572b6 --- /dev/null +++ b/datasets/arabic_oscar.json @@ -0,0 +1,36 @@ +{ + "Name": "Arabic OSCAR", + "Subsets": [], + "HF Link": "https://hf.co/datasets/oscar-corpus/oscar", + "Link": "https://oscar-corpus.com/", + "License": "CC0", + "Year": 2020, + "Language": "ar", + "Dialect": "mixed", + "Domain": "other", + "Form": "text", + "Collection Style": "crawling", + "Description": "A huge multilingual corpus obtained by language classification and filtering of the Common Crawl", + "Volume": "8,117,162,828", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "Inria", + "Derived From": "Common Crawl", + "Paper Title": "A Monolingual Approach to Contextualized Word Embeddings for Mid-Resource Languages", + "Paper Link": "https://arxiv.org/pdf/2006.06202.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "text generation, language modeling", + "Venue Title": "ACL", + "Citations": "39.0", + "Venue Type": "conference", + "Venue Name": "Association for Computational Linguistics", + "Authors": "Pedro Javier Ortiz Su\u00e1rez,L. Romary,Beno\u00eet Sagot", + "Affiliations": "Inria;Sorbonne Universit\u00e9,,", + "Abstract": "We use the multilingual OSCAR corpus, extracted from Common Crawl via language classification, filtering and cleaning, to train monolingual contextualized word embeddings (ELMo) for five mid-resource languages. We then compare the performance of OSCAR-based and Wikipedia-based ELMo embeddings for these languages on the part-of-speech tagging and parsing tasks. We show that, despite the noise in the Common-Crawl-based OSCAR data, embeddings trained on OSCAR perform much better than monolingual embeddings trained on Wikipedia. They actually equal or improve the current state of the art in tagging and parsing for all five languages. 
In particular, they also improve over multilingual Wikipedia-based contextual embeddings (multilingual BERT), which almost always constitutes the previous state of the art, thereby showing that the benefit of a larger, more diverse corpus surpasses the cross-lingual benefit of multilingual embedding architectures.", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/arabic_pos_dialect.json b/datasets/arabic_pos_dialect.json new file mode 100644 index 0000000..c3babee --- /dev/null +++ b/datasets/arabic_pos_dialect.json @@ -0,0 +1,61 @@ +{ + "Name": "Arabic POS Dialect", + "Subsets": [ + { + "Name": "Egyptian", + "Dialect": "ar-EG: (Arabic (Egypt))", + "Volume": "350", + "Unit": "sentences" + }, + { + "Name": "Levantine", + "Dialect": "ar-LEV: (Arabic (Levant))", + "Volume": "350", + "Unit": "sentences" + }, + { + "Name": "Gulf", + "Dialect": "ar-GLF: (Arabic (Gulf))", + "Volume": "350", + "Unit": "sentences" + }, + { + "Name": "Maghrebi", + "Dialect": "ar-MA: (Arabic (Morocco))", + "Volume": "350", + "Unit": "sentences" + } + ], + "HF Link": "https://hf.co/datasets/QCRI/arabic_pos_dialect", + "Link": "https://github.com/qcri/dialectal_arabic_resources", + "License": "unknown", + "Year": 2018, + "Language": "ar", + "Dialect": "mixed", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "Includes tweets in Egyptian, Levantine, Gulf, and Maghrebi, with 350 tweets for each dialect and appropriate train/test/development splits for 5-fold cross-validation", + "Volume": "1,400", + "Unit": "sentences", + "Ethical Risks": "Medium", + "Provider": "QCRI", + "Derived From": "nan", + "Paper Title": "Multi-Dialect Arabic POS Tagging: A CRF Approach", + "Paper Link": "https://aclanthology.org/L18-1015.pdf", + "Script": "Arab", + "Tokenized": "Yes", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "part of speech tagging", + "Venue Title": "LREC", + "Citations": "17.0", + "Venue Type": "conference", + "Venue Name": "International Conference on Language Resources and Evaluation", + "Authors": "Kareem Darwish,Hamdy Mubarak,Ahmed Abdelali,M. Eldesouki,Younes Samih,Randah Alharbi,Mohammed Attia,Walid Magdy,Laura Kallmeyer", + "Affiliations": ",,,,University Of D\u00fcsseldorf;Computational Linguistics,,,The University of Edinburgh,", + "Abstract": "This paper introduces a new dataset of POS-tagged Arabic tweets in four major dialects along with tagging guidelines. The data, which we are releasing publicly, includes tweets in Egyptian, Levantine, Gulf, and Maghrebi, with 350 tweets for each dialect with appropriate train/test/development splits for 5-fold cross validation. We use a Conditional Random Fields (CRF) sequence labeler to train POS taggers for each dialect and examine the effect of cross and joint dialect training, and give benchmark results for the datasets. 
Using clitic n-grams, clitic metatypes, and stem templates as features, we were able to train a joint model that can correctly tag four different dialects with an average accuracy of 89.3%.", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/arabic_punctuation_dataset.json b/datasets/arabic_punctuation_dataset.json new file mode 100644 index 0000000..d509256 --- /dev/null +++ b/datasets/arabic_punctuation_dataset.json @@ -0,0 +1,36 @@ +{ + "Name": "Arabic Punctuation Dataset", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/CBT,https://hf.co/datasets/arbml/ABC,https://hf.co/datasets/arbml/SSAC_UNPC", + "Link": "https://data.mendeley.com/datasets/2pkxckwgs3/1", + "License": "CC BY 4.0", + "Year": 2024, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "books", + "Form": "text", + "Collection Style": "crawling and annotation(translation)", + "Description": "This is a curated dataset, specifically designed to facilitate the study of punctuation. It has undergone rigorous manual annotation and verification on the basis of sentence structure, with sentence boundaries clearly marked.", + "Volume": "12,183,000", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "University of Sharjah, University of Waikato", + "Derived From": "nan", + "Paper Title": "Arabic punctuation dataset", + "Paper Link": "https://www.sciencedirect.com/science/article/pii/S2352340924000908", + "Script": "Arab", + "Tokenized": "No", + "Host": "Mendeley Data", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "punctuation detection", + "Venue Title": "Data in Brief", + "Citations": "nan", + "Venue Type": "journal", + "Venue Name": "Data in Brief", + "Authors": "Sane Yagi, Ashraf Elnagar, Esra Yaghi", + "Affiliations": "nan", + "Abstract": "Arabic, unlike many languages, suffers from punctuation inconsistency, posing a significant obstacle for Natural Language Processing (NLP). To address this, we present the Arabic Punctuation Dataset (APD), a large collection of annotated Modern Standard Arabic texts designed to train machine learning models in sentence boundary identification and punctuation prediction. APD leverages the \u201ctheme-rheme completion\u201d principle, a grammatical feature closely linked to consistent punctuation placement. It consists of an annotated collection of Modern Standard Arabic (MSA) texts that encompass 312 million words in approximately 12 million sentences. It comprises three diverse components: Arabic Book Chapters (ABC): Manually annotated, non-fiction, book excerpts, constituting a gold-standard reference. Complete Book Translations (CBT): Parallel English\u2013Arabic book translations with aligned sentence endings, ideal for machine translation training. Scrambled Sentences from the Arabic Component of the United Nations Parallel Corpus (SSAC-UNPC): Jumbled sentences for model training in automatic punctuation restoration. Beyond NLP, APD serves as a valuable resource for linguistics research, language learning, and real-time subtitling. 
Its authentic, grammar-based approach can enhance the readability and clarity of machine-generated text, opening doors for various applications such as automatic speech recognition, text summarization, and machine translation.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/arabic_rc_datasets.json b/datasets/arabic_rc_datasets.json new file mode 100644 index 0000000..5a2e6be --- /dev/null +++ b/datasets/arabic_rc_datasets.json @@ -0,0 +1,36 @@ +{ + "Name": "Arabic RC datasets", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/Arabic_RC_AQA", + "Link": "https://github.com/MariamBiltawi/Arabic_RC_datasets", + "License": "unknown", + "Year": 2020, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "Arabic Reading Comprehension Benchmarks Created Semiautomatically", + "Volume": "2,862", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "PSUT", + "Derived From": "AQA (1008), Modified TREC (979), Modified CLEF (418)", + "Paper Title": "Arabic Reading Comprehension Benchmarks Created Semiautomatically", + "Paper Link": "https://www.semanticscholar.org/paper/Arabic-Reading-Comprehension-Benchmarks-Created-Biltawi-Awajan/e637ba939d78e6027bbcc8445f93605d36436421", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "question answering", + "Venue Title": "ACIT", + "Citations": "0.0", + "Venue Type": "conference", + "Venue Name": "International Arab Conference on Information Technology", + "Authors": "Mariam Biltawi,A. Awajan,Sara Tedmori", + "Affiliations": ",,", + "Abstract": "Reading comprehension is the task of answering questions from paragraphs; it is also considered a subtask of question-answering systems. Although Arabic is a language spoken by more than 330 million native speakers, it lacks the required resources, which are needed by the Arabic reading comprehension task to serve as a benchmark dataset. The goal of this work is to present the phases of creating an Arabic reading comprehension benchmark dataset semiautomatically. The phases include: data collection, manual check, Google search, document retrieval, and paragraph retrieval. 
The paper also conducts a thorough evaluation for the created datasets.", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/arabic_satire_dataset.json b/datasets/arabic_satire_dataset.json new file mode 100644 index 0000000..8c44bba --- /dev/null +++ b/datasets/arabic_satire_dataset.json @@ -0,0 +1,36 @@ +{ + "Name": "Arabic satire dataset ", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/Arbic-satire-dataset", + "Link": "https://github.com/Noza1234/Arbic-satire-dataset", + "License": "unknown", + "Year": 2023, + "Language": "ar", + "Dialect": "ar-CLS: (Arabic (Classic))", + "Domain": "other", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "500 Arabic news and 500 Arabic satire articles ", + "Volume": "1,000", + "Unit": "sentences", + "Ethical Risks": "Medium", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "Comprehensive Study of Arabic Satirical Article Classification", + "Paper Link": "https://www.mdpi.com/2076-3417/13/19/10616", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "sentiment analysis, information retrieval, fake news detection", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "journal", + "Venue Name": "Applied Sciences", + "Authors": "Fatmah Assiri, Hanen Himdi", + "Affiliations": "nan", + "Abstract": "A well-known issue for social media sites consists of the hazy boundaries between malicious false news and protected speech satire. In addition to the protective measures that lessen the exposure of false material on social media, providers of fake news have started to pose as satire sites in order to escape being delisted. Potentially, this may cause confusion to the readers as satire can sometimes be mistaken for real news, especially when their context or intent is not clearly understood and written in a journalistic format imitating real articles. In this research, we tackle the issue of classifying Arabic satiric articles written in a journalistic format to detect satirical cues that aid in satire classification. To accomplish this, we compiled the first Arabic satirical articles dataset extracted from real-world satirical news platforms. Then, a number of classification models that integrate a variety of feature extraction techniques with machine learning, deep learning, and transformers to detect the provenance of linguistic and semantic cues were investigated, including the first use of the ArabGPt model. Our results indicate that BERT is the best-performing model with F1-score reaching 95%. We also provide an in-depth lexical analysis of the formation of Arabic satirical articles. The lexical analysis provides insights into the satirical nature of the articles in terms of their linguistic word uses. Finally, we developed a free open-source platform that automatically organizes satirical and non-satirical articles in their correct classes from the best-performing model in our study, BERT. 
In summary, the obtained results found that pretrained models gave promising results in classifying Arabic satirical articles.", + "Added By": "Hanen Himdi" +} \ No newline at end of file diff --git a/datasets/arabic_satirical_fake_news_dataset.json b/datasets/arabic_satirical_fake_news_dataset.json new file mode 100644 index 0000000..a59baa0 --- /dev/null +++ b/datasets/arabic_satirical_fake_news_dataset.json @@ -0,0 +1,36 @@ +{ + "Name": "Arabic Satirical Fake News Dataset", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/Satirical_Fake_News", + "Link": "https://github.com/sadanyh/Arabic-Satirical-Fake-News-Dataset", + "License": "CC BY 4.0", + "Year": 2020, + "Language": "ar", + "Dialect": "mixed", + "Domain": "other", + "Form": "text", + "Collection Style": "crawling", + "Description": "A Study of Arabic Satirical Fake News", + "Volume": "6,895", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "Multiple Institutions", + "Derived From": "ComVE", + "Paper Title": "Fake or Real? A Study of Arabic Satirical Fake News", + "Paper Link": "https://arxiv.org/pdf/2011.00452.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "fake news detection", + "Venue Title": "RDSM", + "Citations": "4.0", + "Venue Type": "workshop", + "Venue Name": "International Workshop on Rumours and Deception in Social Media", + "Authors": "Hadeel Saadany,Emad Mohamed,Constantin Orasan", + "Affiliations": ",,University of Surrey, UK", + "Abstract": "One very common type of fake news is satire which comes in a form of a news website or an online platform that parodies reputable real news agencies to create a sarcastic version of reality. This type of fake news is often disseminated by individuals on their online platforms as it has a much stronger effect in delivering criticism than through a straightforward message. However, when the satirical text is disseminated via social media without mention of its source, it can be mistaken for real news. This study conducts several exploratory analyses to identify the linguistic properties of Arabic fake news with satirical content. It shows that although it parodies real news, Arabic satirical news has distinguishing features on the lexico-grammatical level. We exploit these features to build a number of machine learning models capable of identifying satirical fake news with an accuracy of up to 98.6%. The study introduces a new dataset (3185 articles) scraped from two Arabic satirical news websites (\u2018Al-Hudood\u2019 and \u2018Al-Ahram Al-Mexici\u2019) which consists of fake news. The real news dataset consists of 3710 articles collected from three official news sites: the \u2018BBC-Arabic\u2019, the \u2018CNN-Arabic\u2019 and \u2018Al-Jazeera news\u2019. 
Both datasets are concerned with political issues related to the Middle East.", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/arabic_senti-lexicon.json b/datasets/arabic_senti-lexicon.json new file mode 100644 index 0000000..c417a43 --- /dev/null +++ b/datasets/arabic_senti-lexicon.json @@ -0,0 +1,36 @@ +{ + "Name": "Arabic senti-lexicon", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/Senti_Lexicon", + "Link": "https://github.com/almoslmi/masc", + "License": "custom", + "Year": 2018, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "manual curation", + "Description": "a list of 3880 positive and negative synsets annotated with their part of speech, polarity scores, dialects synsets and inflected forms", + "Volume": "3,880", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "Multiple Institutions", + "Derived From": "nan", + "Paper Title": "Arabic senti-lexicon: Constructing publicly available language resources for Arabic sentiment analysis", + "Paper Link": "https://journals.sagepub.com/doi/full/10.1177/0165551516683909", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "part of speech tagging, sentiment analysis", + "Venue Title": "JIS", + "Citations": "54.0", + "Venue Type": "journal", + "Venue Name": "Journal of Information Science", + "Authors": "Tareq Al-Moslmi,M. Albared,Adel Al-Shabi,N. Omar,S. Abdullah", + "Affiliations": ",,,,", + "Abstract": "Sentiment analysis is held to be one of the highly dynamic recent research fields in Natural Language Processing, facilitated by the quickly growing volume of Web opinion data. Most of the approaches in this field are focused on English due to the lack of sentiment resources in other languages such as the Arabic language and its large variety of dialects. In most sentiment analysis applications, good sentiment resources play a critical role. Based on that, in this article, several publicly available sentiment analysis resources for Arabic are introduced. This article introduces the Arabic senti-lexicon, a list of 3880 positive and negative synsets annotated with their part of speech, polarity scores, dialects synsets and inflected forms. This article also presents a Multi-domain Arabic Sentiment Corpus (MASC) with a size of 8860 positive and negative reviews from different domains. In this article, an in-depth study has been conducted on five types of feature sets for exploiting effective features and investigating their effect on performance of Arabic sentiment analysis. The aim is to assess the quality of the developed language resources and to integrate different feature sets and classification algorithms to synthesise a more accurate sentiment analysis method. The Arabic senti-lexicon is used for generating feature vectors. Five well-known machine learning algorithms: na\u00efve Bayes, k-nearest neighbours, support vector machines (SVMs), logistic linear regression and neural network are employed as base-classifiers for each of the feature sets. A wide range of comparative experiments on standard Arabic data sets were conducted, discussion is presented and conclusions are drawn. The experimental results show that the Arabic senti-lexicon is a very useful resource for Arabic sentiment analysis. 
Moreover, results show that classifiers which are trained on feature vectors derived from the corpus using the Arabic sentiment lexicon are more accurate than classifiers trained using the raw corpus.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/arabic_sentiment_lexicons.json b/datasets/arabic_sentiment_lexicons.json new file mode 100644 index 0000000..4c7e004 --- /dev/null +++ b/datasets/arabic_sentiment_lexicons.json @@ -0,0 +1,36 @@ +{ + "Name": "Arabic Sentiment Lexicons", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/Sentiment_Lexicons", + "Link": "https://saifmohammad.com/WebPages/ArabicSA.html", + "License": "unknown", + "Year": 2016, + "Language": "ar", + "Dialect": "mixed", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(translation)", + "Description": "Created by using distant supervision techniques on Arabic tweets, and by translating English sentiment lexicons into Arabic using a freely available statistical machine translation system", + "Volume": "176,364", + "Unit": "tokens", + "Ethical Risks": "Medium", + "Provider": "Multiple Institutions", + "Derived From": "nan", + "Paper Title": "Sentiment Lexicons for Arabic Social Media", + "Paper Link": "https://aclanthology.org/L16-1006.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "sentiment analysis", + "Venue Title": "LREC", + "Citations": "51.0", + "Venue Type": "conference", + "Venue Name": "International Conference on Language Resources and Evaluation", + "Authors": "Saif M. Mohammad,Mohammad Salameh,Svetlana Kiritchenko", + "Affiliations": "National Research Council Canada,,", + "Abstract": "Existing Arabic sentiment lexicons have low coverage\u2015with only a few thousand entries. In this paper, we present several large sentiment lexicons that were automatically generated using two different methods: (1) by using distant supervision techniques on Arabic tweets, and (2) by translating English sentiment lexicons into Arabic using a freely available statistical machine translation system. We compare the usefulness of new and old sentiment lexicons in the downstream application of sentence-level sentiment analysis. Our baseline sentiment analysis system uses numerous surface form features. Nonetheless, the system benefits from using additional features drawn from sentiment lexicons. The best result is obtained using the automatically generated Dialectal Hashtag Lexicon and the Arabic translations of the NRC Emotion Lexicon (accuracy of 66.6%). Finally, we describe a qualitative study of the automatic translations of English sentiment lexicons into Arabic, which shows that about 88% of the automatically translated entries are valid for English as well. 
Close to 10% of the invalid entries are caused by gross mistranslations, close to 40% by translations into a related word, and about 50% by differences in how the word is used in Arabic.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/arabic_sentiment_twitter_corpus.json b/datasets/arabic_sentiment_twitter_corpus.json new file mode 100644 index 0000000..e23621c --- /dev/null +++ b/datasets/arabic_sentiment_twitter_corpus.json @@ -0,0 +1,36 @@ +{ + "Name": "Arabic Sentiment Twitter Corpus", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/Arabic_Sentiment_Twitter_Corpus", + "Link": "https://www.kaggle.com/datasets/mksaad/arabic-sentiment-twitter-corpus", + "License": "custom", + "Year": 2019, + "Language": "ar", + "Dialect": "mixed", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "A Sentiment Analysis dataset. No extra information is provided regarding the dialects or the collection methodology", + "Volume": "58,000", + "Unit": "sentences", + "Ethical Risks": "Medium", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "kaggle", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "sentiment analysis", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "Motaz Saad", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Maged S. Alshaibani" +} \ No newline at end of file diff --git a/datasets/arabic_sms_chat.json b/datasets/arabic_sms_chat.json new file mode 100644 index 0000000..dc988f3 --- /dev/null +++ b/datasets/arabic_sms_chat.json @@ -0,0 +1,36 @@ +{ + "Name": "Arabic SMS/Chat", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2017T07", + "License": "LDC User Agreement for Non-Members", + "Year": 2014, + "Language": "ar", + "Dialect": "mixed", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "5,691 conversations totaling 1,029,248 words across 262,026 messages. Messages were natively written in either Arabic orthography or romanized Arabizi", + "Volume": "262,026", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "Transliteration of Arabizi into Arabic Orthography: Developing a Parallel Annotated Arabizi-Arabic Script SMS/Chat Corpus", + "Paper Link": "https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/emnlp2014-transliteration-aribizi-into-arabic-orthography.pdf", + "Script": "Arab-Latn", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "1,750.00 $", + "Test Split": "No", + "Tasks": "transliteration, machine translation", + "Venue Title": "WANLP", + "Citations": "39.0", + "Venue Type": "workshop", + "Venue Name": "Arabic Natural Language Processing Workshop", + "Authors": "Ann Bies,Zhiyi Song,M. Maamouri,Stephen Grimes,Haejoong Lee,Jonathan Wright,S. Strassel,Nizar Habash,R. Eskander,Owen Rambow", + "Affiliations": ",,,,,,,,,", + "Abstract": "This paper describes the process of creating a novel resource, a parallel Arabizi-Arabic script corpus of SMS/Chat data. 
The language used in social media expresses many differences from other written genres: its vocabulary is informal with intentional deviations from standard orthography such as repeated letters for emphasis; typos and nonstandard abbreviations are common; and nonlinguistic content is written out, such as laughter, sound representations, and emoticons. This situation is exacerbated in the case of Arabic social media for two reasons. First, Arabic dialects, commonly used in social media, are quite different from Modern Standard Arabic phonologically, morphologically and lexically, and most importantly, they lack standard orthographies. Second, Arabic speakers in social media as well as discussion forums, SMS messaging and online chat often use a non-standard romanization called Arabizi. In the context of natural language processing of social media Arabic, transliterating from Arabizi of various dialects to Arabic script is a necessary step, since many of the existing state-of-the-art resources for Arabic dialect processing expect Arabic script input. The corpus described in this paper is expected to support Arabic NLP by providing this resource.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/arabic_spam_and_ham_tweets.json b/datasets/arabic_spam_and_ham_tweets.json new file mode 100644 index 0000000..e897aa2 --- /dev/null +++ b/datasets/arabic_spam_and_ham_tweets.json @@ -0,0 +1,36 @@ +{ + "Name": "Arabic Spam and Ham Tweets", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/arabic_spam_ham_twitter", + "Link": "https://data.mendeley.com/datasets/86x733xkb8/2", + "License": "CC BY 4.0", + "Year": 2024, + "Language": "ar", + "Dialect": "mixed", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "The dataset contains 13,241 records. Each record represents a tweet. The tweets are labeled either Ham or Spam. Ham means a non-spam tweet. There are 1,924 Spam tweets and 11,299 Ham tweets. The tweets are unique, i.e., there are no repeated tweet records.", + "Volume": "13,241", + "Unit": "sentences", + "Ethical Risks": "High", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "Dataset of Arabic spam and ham tweets", + "Paper Link": "https://data.mendeley.com/datasets/86x733xkb8/2", + "Script": "Arab", + "Tokenized": "No", + "Host": "Mendeley Data", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "spam detection", + "Venue Title": "Data in Brief", + "Citations": "nan", + "Venue Type": "journal", + "Venue Name": "Data in Brief", + "Authors": "Sanaa Kaddoura, Safaa Henno", + "Affiliations": "nan", + "Abstract": "This data article provides a dataset of 13,241 posts and their corresponding information collected from Twitter social media. The data has two classes, ham or spam, where ham indicates non-spam clean tweets. The main target of this dataset is to study a way to classify whether a post is spam or not automatically. The data is in the Arabic language only, which makes the data essential to the researchers in Arabic natural language processing (NLP) due to the lack of resources in this language. The data is made publicly available to allow researchers to use it as a benchmark for their research in Arabic NLP. The dataset was collected using the Twitter REST API between January 27, 2021, and March 10, 2021. An ad-hoc crawler was constructed using the Python programming language to collect the data. 
Many scientists and researchers will benefit from this dataset in the domain of cybersecurity, NLP, data science and social networking analysis.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/arabic_speech_commands_dataset.json b/datasets/arabic_speech_commands_dataset.json new file mode 100644 index 0000000..7d1089c --- /dev/null +++ b/datasets/arabic_speech_commands_dataset.json @@ -0,0 +1,36 @@ +{ + "Name": "Arabic Speech Commands Dataset", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/Speech_Commands_Dataset", + "Link": "https://github.com/abdulkaderghandoura/arabic-speech-commands-dataset", + "License": "CC BY 4.0", + "Year": 2021, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "spoken", + "Collection Style": "manual curation", + "Description": "This dataset is designed to help train simple machine learning models that serve educational and research purposes in the speech recognition domain", + "Volume": "3", + "Unit": "hours", + "Ethical Risks": "Low", + "Provider": "Multiple Institutions", + "Derived From": "nan", + "Paper Title": "Building and benchmarking an Arabic Speech Commands dataset for small-footprint keyword spotting", + "Paper Link": "https://www.sciencedirect.com/science/article/pii/S0952197621001147", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "speech recognition", + "Venue Title": "EAAI ", + "Citations": "0.0", + "Venue Type": "journal", + "Venue Name": "Engineering Applications of Artificial Intelligence", + "Authors": "Abdulkader Ghandoura,Farouk Hjabo,Oumayma Al Dakkak", + "Affiliations": ",,", + "Abstract": "The introduction of the Google Speech Commands dataset accelerated research and resulted in a variety of new deep learning approaches that address keyword spotting tasks. The main contribution of this work is the building of an Arabic Speech Commands dataset, a counterpart to Google\u2019s dataset. Our dataset consists of 12000 instances, collected from 30 contributors, and grouped into 40 keywords. We also report different experiments to benchmark this dataset using classical machine learning and deep learning approaches, the best of which is a Convolutional Neural Network with Mel-Frequency Cepstral Coefficients that achieved an accuracy of 98%. 
Additionally, we point out some key ideas to be considered in such tasks.", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/arabic_speech_corpus.json b/datasets/arabic_speech_corpus.json new file mode 100644 index 0000000..0fc9ac3 --- /dev/null +++ b/datasets/arabic_speech_corpus.json @@ -0,0 +1,36 @@ +{ + "Name": "Arabic Speech Corpus", + "Subsets": [], + "HF Link": "https://hf.co/datasets/halabi2016/arabic_speech_corpus", + "Link": "http://en.arabicspeechcorpus.com/", + "License": "CC BY 4.0", + "Year": 2015, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "transcribed audio", + "Form": "spoken", + "Collection Style": "manual curation", + "Description": "The corpus was recorded in south Levantine Arabic (Damascian accent) using a professional studio.", + "Volume": "4", + "Unit": "hours", + "Ethical Risks": "Low", + "Provider": "University of Southampton", + "Derived From": "nan", + "Paper Title": "Modern Standard Arabic Speech Corpus", + "Paper Link": "https://ota.bodleian.ox.ac.uk/repository/xmlui/bitstream/handle/20.500.12024/2561/arabic-speech-corpus-report.pdf?sequence=3", + "Script": "Arab-Latn", + "Tokenized": "No", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "speech recognition", + "Venue Title": "other", + "Citations": "3.0", + "Venue Type": "preprint", + "Venue Name": "nan", + "Authors": "Nawar Halabi,Gary B Wills", + "Affiliations": ",", + "Abstract": "Corpus design for speech synthesis is a well-researched topic in languages such as English compared to Modern Standard Arabic, and there is a tendency to focus on methods to automatically generate the orthographic transcript to be recorded (usually greedy methods), which was used in this work. In this work, a study of Modern Standard Arabic (MSA) phonetics and phonology is conducted in order to develop criteria for a greedy method to create a MSA speech corpus transcript for recording. The size of the dataset is reduced a number of times using optimisation methods with different parameters to yield a much smaller dataset with the identical phonetic coverage offered before the reduction. The resulting output transcript is then chosen for recording. A phoneme set and a phonotactic rule-set are created for automatically generating a phonetic transcript of normalised MSA text which is used to annotate and segment the speech corpus after recording, achieving 82.5% boundary precision with some manual alignments (~15% of the corpus) to increase the precision of the automatic alignment. 
This is part of a larger work to create a completely annotated and segmented speech corpus for MSA speech synthesis with an evaluation of the quality of this speech corpus and, where possible, the quality of each stage in the process.", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/arabic_speech_recognition_pronunciation_dictionary.json b/datasets/arabic_speech_recognition_pronunciation_dictionary.json new file mode 100644 index 0000000..02d481f --- /dev/null +++ b/datasets/arabic_speech_recognition_pronunciation_dictionary.json @@ -0,0 +1,36 @@ +{ + "Name": "Arabic Speech Recognition Pronunciation Dictionary", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2017L01", + "License": "LDC User Agreement for Non-Members", + "Year": 2017, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "other", + "Description": "The dictionary was developed from news archive resources, including the Arabic news website Aljazeera.net. The selected words were those that occurred more than once in the news collection. The text was processed using MADA.", + "Volume": "526,000", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "200.00 $", + "Test Split": "No", + "Tasks": "language teaching,speech recognition,pronunciation modeling", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/arabic_stop_words.json b/datasets/arabic_stop_words.json new file mode 100644 index 0000000..882317a --- /dev/null +++ b/datasets/arabic_stop_words.json @@ -0,0 +1,36 @@ +{ + "Name": "Arabic Stop words", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/arabic_stop_words", + "Link": "https://github.com/mohataher/arabic-stop-words", + "License": "MIT License", + "Year": 2016, + "Language": "ar", + "Dialect": "mixed", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "Largest list of Arabic stop words on GitHub.", + "Volume": "750", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "Yes", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "sentiment analysis", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "Mohamed Taher Alrefaie, Tarek BAZINE", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Abdelrahman Rezk" +} \ No newline at end of file diff --git a/datasets/arabic_text_diacritization.json b/datasets/arabic_text_diacritization.json new file mode 100644 index 0000000..2b6a8d5 --- /dev/null +++ b/datasets/arabic_text_diacritization.json @@ -0,0 +1,36 @@ +{ + "Name": "Arabic Text Diacritization", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/arabic_text_diacritization", + "Link": "https://github.com/AliOsm/arabic-text-diacritization", + "License": "MIT License", + "Year": 2019, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": 
"other", + "Description": "Arabic Text Diacritization dataset", + "Volume": "55,000", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "JUST", + "Derived From": "nan", + "Paper Title": "Arabic Text Diacritization Using Deep Neural\nNetworks", + "Paper Link": "https://arxiv.org/pdf/1905.01965.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "diacritization", + "Venue Title": "ICCAIS", + "Citations": "12.0", + "Venue Type": "conference", + "Venue Name": "International Conference on Computer Applications & Information Security", + "Authors": "Ali Fadel,Ibraheem Tuffaha,Bara' Al-Jawarneh,M. Al-Ayyoub", + "Affiliations": ",,,", + "Abstract": "Diacritization of Arabic text is both an interesting and a challenging problem at the same time with various applications ranging from speech synthesis to helping students learning the Arabic language. Like many other tasks or problems in Arabic language processing, the weak efforts invested into this problem and the lack of available (open-source) resources hinder the progress towards solving this problem. This work provides a critical review for the currently existing systems, measures and resources for Arabic text diacritization. Moreover, it introduces a much-needed free-for-all cleaned dataset that can be easily used to benchmark any work on Arabic diacritization. Extracted from the Tashkeela Corpus, the dataset consists of 55K lines containing about 2.3M words. After constructing the dataset, existing tools and systems are tested on it. The results of the experiments show that the neural Shakkala system significantly outperforms traditional rule-based approaches and other closed-source tools with a Diacritic Error Rate (DER) of 2.88% compared with 13.78%, which the best DER for the non-neural approach (obtained by the Mishkal tool).", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/arabic_textual_entailment_dataset.json b/datasets/arabic_textual_entailment_dataset.json new file mode 100644 index 0000000..4fe614b --- /dev/null +++ b/datasets/arabic_textual_entailment_dataset.json @@ -0,0 +1,36 @@ +{ + "Name": "Arabic Textual Entailment Dataset", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/ArabicTE", + "Link": "http://www.cs.man.ac.uk/~ramsay/ArabicTE/", + "License": "unknown", + "Year": 2013, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "manual curation", + "Description": "This dataset contains 600 Arabic premise:hypothesis pairs. ", + "Volume": "600", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "Multiple Institutions", + "Derived From": "nan", + "Paper Title": "Natural Language Inference for Arabic Using Extended Tree\r\nEdit Distance with Subtrees", + "Paper Link": "https://arxiv.org/ftp/arxiv/papers/1402/1402.0578.pdf", + "Script": "Latn", + "Tokenized": "No", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "natural language inference", + "Venue Title": "JAIR", + "Citations": "21.0", + "Venue Type": "journal", + "Venue Name": "Journal of Artificial Intelligence Research", + "Authors": "Maytham Alabbas,A. Ramsay", + "Affiliations": ",", + "Abstract": "Many natural language processing (NLP) applications require the computation of similarities between pairs of syntactic or semantic trees. 
Many researchers have used tree edit distance for this task, but this technique suffers from the drawback that it deals with single node operations only. We have extended the standard tree edit distance algorithm to deal with subtree transformation operations as well as single nodes. The extended algorithm with subtree operations, TED+ST, is more effective and flexible than the standard algorithm, especially for applications that pay attention to relations among nodes (e.g. in linguistic trees, deleting a modifier subtree should be cheaper than the sum of deleting its components individually). We describe the use of TED+ST for checking entailment between two Arabic text snippets. The preliminary results of using TED+ST were encouraging when compared with two string-based approaches and with the standard algorithm.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/arabic_treebank_-_broadcast_news_v1_0.json b/datasets/arabic_treebank_-_broadcast_news_v1_0.json new file mode 100644 index 0000000..9d9a009 --- /dev/null +++ b/datasets/arabic_treebank_-_broadcast_news_v1_0.json @@ -0,0 +1,36 @@ +{ + "Name": "Arabic Treebank - Broadcast News v1.0", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2012T07", + "License": "LDC User Agreement for Non-Members", + "Year": 2012, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "This release contains 432,976 source tokens before clitics were split, and 517,080 tree tokens after clitics were separated for treebank annotation. The source materials are Arabic broadcast news stories collected by LDC during the period 2005-2008 from the following sources: Abu Dhabi TV, Al Alam News Channel, Al Arabiya, Al Baghdadya TV, Al Fayha, Alhurra, Al Iraqiyah, Aljazeera, Al Ordiniyah, Al Sharqiyah, Dubai TV, Kuwait TV, Lebanese Broadcasting Corp., Oman TV, Radio Sawa, Saudi TV and Syria TV. The transcripts were produced by LDC.", + "Volume": "432,976", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "4,500.00 $", + "Test Split": "No", + "Tasks": "cross-lingual information retrieval,information retrieval,information detection", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/arabic_treebank_-_weblog.json b/datasets/arabic_treebank_-_weblog.json new file mode 100644 index 0000000..df95dea --- /dev/null +++ b/datasets/arabic_treebank_-_weblog.json @@ -0,0 +1,36 @@ +{ + "Name": "Arabic Treebank - Weblog", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2016T02", + "License": "LDC User Agreement for Non-Members", + "Year": 2016, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "web pages", + "Form": "text", + "Collection Style": "crawling", + "Description": "This release contains 243,117 source tokens before clitics were split, and 308,996 tree tokens after clitics were separated for treebank annotation. 
The source material is weblogs collected by LDC from various sources.", + "Volume": "243,117", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "University of Pennsylvania", + "Derived From": "nan", + "Paper Title": "Consistent and Flexible Integration of Morphological Annotation in the Arabic Treebank", + "Paper Link": "https://catalog.ldc.upenn.edu/docs/LDC2016T02/KulickBiesMaamouri-LREC2010.pdf", + "Script": "Arab-Latn", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "4,500.00 $", + "Test Split": "No", + "Tasks": "part of speech tagging", + "Venue Title": "LREC", + "Citations": "24.0", + "Venue Type": "conference", + "Venue Name": "International Conference on Language Resources and Evaluation (LREC 2010)", + "Authors": "Mohamed Maamouri, Ann Bies, Seth Kulick, Sondos Krouna, Dalila Tabassi, Michael Ciul", + "Affiliations": "Linguistic Data Consortium - University of Pennsylvania", + "Abstract": "Complications arise for standoff annotation when the annotation is not on the source text itself, but on a more abstract representation. This is particularly the case in a language such as Arabic with morphological and orthographic challenges, and we discuss various aspects of these issues in the context of the Arabic Treebank. The Standard Arabic Morphological Analyzer (SAMA) is closely integrated into the annotation workflow, as the basis for the abstraction between the explicit source text and the more abstract token representation. However, this integration with SAMA gives rise to various problems for the annotation workflow and for maintaining the link between the Treebank and SAMA. In this paper we discuss how we have overcome these problems with consistent and more precise categorization of all of the tokens for their relationship with SAMA. We also discuss how we have improved the creation of several distinct alternative forms of the tokens used in the syntactic trees. As a result, the Treebank provides a resource relating the different forms of the same underlying token with varying degrees of vocalization, in terms of how they relate (1) to each other, (2) to the syntactic structure, and (3) to the morphological analyzer.", + "Added By": "Mustafa Ghaleb" +} \ No newline at end of file diff --git a/datasets/arabic_treebank__part_1_-_10k-word_english_translation.json b/datasets/arabic_treebank__part_1_-_10k-word_english_translation.json new file mode 100644 index 0000000..f947f02 --- /dev/null +++ b/datasets/arabic_treebank__part_1_-_10k-word_english_translation.json @@ -0,0 +1,36 @@ +{ + "Name": "Arabic Treebank: Part 1 - 10K-word English Translation", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2003T07", + "License": "LDC User Agreement for Non-Members", + "Year": 2003, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "other", + "Description": "The project targets the translation of a written Modern Standard Arabic corpus from the Agence France Presse (AFP) newswire archives for July 2000 (the files are dated 07/15/2000). The corpus consists of 49 source stories, which is a subset of the 734 stories published in Arabic Treebank: Part 1 v 2.0 (LDC2003T06). 
These 49 source files comprise 418 paragraphs and 9,981 words.", + "Volume": "9,981", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "1,500.00 $", + "Test Split": "No", + "Tasks": "information retrieval,cross-lingual information retrieval,information detection", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/arabic_treebank__part_1_v_2_0.json b/datasets/arabic_treebank__part_1_v_2_0.json new file mode 100644 index 0000000..81f11fb --- /dev/null +++ b/datasets/arabic_treebank__part_1_v_2_0.json @@ -0,0 +1,36 @@ +{ + "Name": "Arabic Treebank: Part 1 v 2.0", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2003T06", + "License": "LDC User Agreement for Non-Members", + "Year": 2003, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "other", + "Description": "The following table gives a breakdown of the data contained in the entire Arabic Treebank project, with discrepancies between versions for Parts 1, 2, and 3. The fields include source, number of stories, total number of tokens, number of tokens after clitic separation, and number of Arabic word tokens after punctuation, numbers, and Latin strings have been taken out. The totals given at the bottom are calculated from the latest versions where discrepancies exist, and do not include tokens after clitic separation since that number is missing from Part 4.", + "Volume": "688,549", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "3,000.00 $", + "Test Split": "No", + "Tasks": "information retrieval,cross-lingual information retrieval,information detection", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/arabic_treebank__part_1_v_3_0_(pos_with_full_vocalization_+_syntactic_analysis).json b/datasets/arabic_treebank__part_1_v_3_0_(pos_with_full_vocalization_+_syntactic_analysis).json new file mode 100644 index 0000000..0c42a76 --- /dev/null +++ b/datasets/arabic_treebank__part_1_v_3_0_(pos_with_full_vocalization_+_syntactic_analysis).json @@ -0,0 +1,36 @@ +{ + "Name": "Arabic Treebank: Part 1 v 3.0 (POS with full vocalization + syntactic analysis)", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2005T02", + "License": "LDC User Agreement for Non-Members", + "Year": 2005, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "other", + "Description": "The following table gives a breakdown of the data contained in the entire Arabic Treebank project, with discrepancies between versions for Parts 1 and 3. 
The fields include source, number of stories, total number of tokens, number of tokens after clitic separation, and number of Arabic word tokens after punctuation, numbers, and Latin strings have been taken out. The totals given at the bottom are calculated from the latest versions where discrepancies exist, and do not include tokens after clitic separation since that number is missing from Part 4.", + "Volume": "2,231", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "4,000.00 $", + "Test Split": "No", + "Tasks": "information retrieval,cross-lingual information retrieval,information detection", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/arabic_treebank__part_1_v_4_1.json b/datasets/arabic_treebank__part_1_v_4_1.json new file mode 100644 index 0000000..96cfdf0 --- /dev/null +++ b/datasets/arabic_treebank__part_1_v_4_1.json @@ -0,0 +1,36 @@ +{ + "Name": "Arabic Treebank: Part 1 v 4.1", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2010T13", + "License": "LDC User Agreement for Non-Members", + "Year": 2010, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "other", + "Description": "ATB1 v 4.1 contains a total of 145,386 tokens before clitics are split, and 167,280 tokens after clitics are separated for the treebank annotation.", + "Volume": "145,386", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "4,500.00 $", + "Test Split": "No", + "Tasks": "information retrieval,cross-lingual information retrieval,information detection", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/arabic_treebank__part_2_v_2_0.json b/datasets/arabic_treebank__part_2_v_2_0.json new file mode 100644 index 0000000..9a0858d --- /dev/null +++ b/datasets/arabic_treebank__part_2_v_2_0.json @@ -0,0 +1,36 @@ +{ + "Name": "Arabic Treebank: Part 2 v 2.0", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2004T02", + "License": "LDC User Agreement for Non-Members", + "Year": 2004, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "other", + "Description": "The following table gives a breakdown of the data contained in the entire Arabic Treebank project, with discrepancies between versions for Parts 1, 2, and 3. The fields include source, number of stories, total number of tokens, number of tokens after clitic separation, and number of Arabic word tokens after punctuation, numbers, and Latin strings have been taken out. 
The totals given at the bottom are calculated from the latest versions where discrepancies exist, and do not include tokens after clitic separation since that number is missing from Part 4.", + "Volume": "688,549", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "4,000.00 $", + "Test Split": "No", + "Tasks": "information retrieval,cross-lingual information retrieval,information detection", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/arabic_treebank__part_2_v_3_1.json b/datasets/arabic_treebank__part_2_v_3_1.json new file mode 100644 index 0000000..13e67e1 --- /dev/null +++ b/datasets/arabic_treebank__part_2_v_3_1.json @@ -0,0 +1,36 @@ +{ + "Name": "Arabic Treebank: Part 2 v 3.1", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2011T09", + "License": "LDC User Agreement for Non-Members", + "Year": 2011, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "other", + "Description": "ATB2 v 3.1 contains a total of 144,199 source tokens before clitics are split, and 169,319 tree tokens after clitics are separated for the treebank annotation. Source texts were selected from Ummah Press news archives covering the period from July 2001 through September 2002.", + "Volume": "501", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "4,500.00 $", + "Test Split": "No", + "Tasks": "information retrieval,cross-lingual information retrieval,information detection", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/arabic_treebank__part_3.json b/datasets/arabic_treebank__part_3.json new file mode 100644 index 0000000..0eaf1ca --- /dev/null +++ b/datasets/arabic_treebank__part_3.json @@ -0,0 +1,36 @@ +{ + "Name": "Arabic Treebank: Part 3", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2005T20", + "License": "LDC User Agreement for Non-Members", + "Year": 2005, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "manual curation", + "Description": "This dataset is a collection of 300,000 Arabic tokens with syntactic treebank annotation as well as annotation for part of speech (POS), gloss, and word segmentation.", + "Volume": "300,000", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "part of the dataset was derived from Arabic Gigaword (LDC2003T12)", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "3,500.00 $", + "Test Split": "No", + "Tasks": "information retrieval, cross-lingual information retrieval, information detection", + "Venue Title": "nan", + "Citations": "24.0", + "Venue Type": 
"nan", + "Venue Name": "nan", + "Authors": "Mohamed Maamouri, Ann Bies, Tim Buckwalter, Hubert Jin, Wigdan Mekki", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Maged S. Alshaibani" +} \ No newline at end of file diff --git a/datasets/arabic_treebank__part_3_v_1_0.json b/datasets/arabic_treebank__part_3_v_1_0.json new file mode 100644 index 0000000..6dd43c4 --- /dev/null +++ b/datasets/arabic_treebank__part_3_v_1_0.json @@ -0,0 +1,36 @@ +{ + "Name": "Arabic Treebank: Part 3 v 1.0", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2004T11", + "License": "LDC User Agreement for Non-Members", + "Year": 2004, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "other", + "Description": "The following table gives a breakdown of the data contained in the entire Arabic Treebank project, with discrepancies between versions for Parts 1 and 3. The fields include source, number of stories, total number of tokens, number of tokens after clitic separation, and number of Arabic word tokens after punctuation, numbers, and latin strings have been taken out. The totals given at the bottom are calculated from the latest versions where discrepencies exist, and do not include tokens after clitic separation since that number is missing from Part 4.", + "Volume": "300,000", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "3,500.00 $", + "Test Split": "No", + "Tasks": "information retrieval,cross-lingual information retrieval,information detection", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/arabic_treebank__part_3_v_3_2.json b/datasets/arabic_treebank__part_3_v_3_2.json new file mode 100644 index 0000000..c74da4a --- /dev/null +++ b/datasets/arabic_treebank__part_3_v_3_2.json @@ -0,0 +1,36 @@ +{ + "Name": "Arabic Treebank: Part 3 v 3.2", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2010T08", + "License": "LDC User Agreement for Non-Members", + "Year": 2010, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "other", + "Description": "ATB3 v 3.2 contains a total of 339,710 tokens before clitics are split, and 402,291 tokens after clitics are separated for the treebank annotation. This release includes all files that were previously made available to the DARPA GALE program community (Arabic Treebank Part 3 - Version 3.1, LDC2008E22). A number of inconsistencies in the 3.1 release data have been corrected here. These include changes to certain POS tags with the resulting tree changes. 
As a result, additional clitics have been separated, and some previously incorrectly split tokens have now been merged.", + "Volume": "339,710", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "4,500.00 $", + "Test Split": "No", + "Tasks": "information retrieval,cross-lingual information retrieval,information detection", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/arabic_treebank__part_4_v_1_0_(mpg_annotation).json b/datasets/arabic_treebank__part_4_v_1_0_(mpg_annotation).json new file mode 100644 index 0000000..1d0d6d1 --- /dev/null +++ b/datasets/arabic_treebank__part_4_v_1_0_(mpg_annotation).json @@ -0,0 +1,36 @@ +{ + "Name": "Arabic Treebank: Part 4 v 1.0 (MPG Annotation)", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2005T30", + "License": "LDC User Agreement for Non-Members", + "Year": 2005, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "The following table gives a breakdown of the data contained in the entire Arabic Treebank project, with discrepancies between versions for Parts 1 and 3. The fields include source, number of stories, total number of tokens, number of tokens after clitic separation, and number of Arabic word tokens after punctuation, numbers, and Latin strings have been taken out. The totals given at the bottom are calculated from the latest versions where discrepancies exist, and do not include tokens after clitic separation since that number is missing from Part 4.", + "Volume": "2,231", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "3,500.00 $", + "Test Split": "No", + "Tasks": "information retrieval,cross-lingual information retrieval,information detection", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/arabic_tweets_about_infectious_diseases.json b/datasets/arabic_tweets_about_infectious_diseases.json new file mode 100644 index 0000000..f6f3d2a --- /dev/null +++ b/datasets/arabic_tweets_about_infectious_diseases.json @@ -0,0 +1,36 @@ +{ + "Name": "Arabic tweets about infectious diseases", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/tweets_infectious_diseases", + "Link": "http://www.research.lancs.ac.uk/portal/en/datasets/arabic-tweets-about-infectious-diseases(68b307a8-510b-42f6-93af-f0cc7d9a75a1).html", + "License": "custom", + "Year": 2019, + "Language": "ar", + "Dialect": "mixed", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "This file contains a dataset of 1,266 tweets annotated by two Arabic native speakers into five types of sources: academic, media, government, health professional, and public.", + "Volume": "1,266", + "Unit": "sentences", + "Ethical Risks": "High", + "Provider": 
"Lancaster University", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "topic classification", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/arabic_wiki_data_dump_2018.json b/datasets/arabic_wiki_data_dump_2018.json new file mode 100644 index 0000000..121558d --- /dev/null +++ b/datasets/arabic_wiki_data_dump_2018.json @@ -0,0 +1,36 @@ +{ + "Name": "Arabic Wiki data Dump 2018", + "Subsets": [], + "HF Link": "nan", + "Link": "https://www.kaggle.com/datasets/abedkhooli/arabic-wiki-data-dump-2018", + "License": "unknown", + "Year": 2018, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "wikipedia", + "Form": "text", + "Collection Style": "crawling", + "Description": "Arabic is a rich and major world language. Recent advances in computational linguistics and AI can be applied to Arabic but not in the generic way most languages are treated. This dataset (Arabic articles from Wikipedia) will be used to train Word2Vec and compare performance with publicly available pre-trained model from FastText (Facebook) in a generic way. A related model is now available: https://www.kaggle.com/abedkhooli/arabic-ulmfit-model", + "Volume": "nan", + "Unit": "documents", + "Ethical Risks": "nan", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "kaggle", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "text generation, language modeling", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "ABED KHOOLI ", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Kamel GAANOUN" +} \ No newline at end of file diff --git a/datasets/arabic_wikipedia_20230101_bots.json b/datasets/arabic_wikipedia_20230101_bots.json new file mode 100644 index 0000000..c203e06 --- /dev/null +++ b/datasets/arabic_wikipedia_20230101_bots.json @@ -0,0 +1,36 @@ +{ + "Name": "Arabic_Wikipedia_20230101_bots", + "Subsets": [], + "HF Link": "https://hf.co/datasets/SaiedAlshahrani/Arabic_Wikipedia_20230101_bots", + "Link": "https://hf.co/datasets/SaiedAlshahrani/Arabic_Wikipedia_20230101_bots", + "License": "MIT License", + "Year": 2023, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "wikipedia", + "Form": "text", + "Collection Style": "manual curation", + "Description": "Arabic_Wikipedia_20230101_bots is a dataset created using the Arabic Wikipedia articles, including the bot-generated articles, downloaded on the 1st of January 2023, and processed to train an Arabic RoBERTa model.", + "Volume": "1,000,000", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "Clarkson University", + "Derived From": "Arabic Wikipedia Dump 2023-01-01", + "Paper Title": "Performance Implications of Using Unrepresentative Corpora in Arabic Natural Language Processing", + "Paper Link": "https://aclanthology.org/2023.arabicnlp-1.19.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "HuggingFace", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "language modeling", + "Venue Title": "ArabicNLP 2023", + "Citations": "nan", + "Venue 
Type": "conference", + "Venue Name": "The First Arabic Natural Language Processing Conference", + "Authors": "Saied Alshahrani, Norah Alshahrani, Soumyabrata Dey, Jeanna Matthews", + "Affiliations": "Clarkson University", + "Abstract": "Wikipedia articles are a widely used source of training data for Natural Language Processing (NLP) research, particularly as corpora for low-resource languages like Arabic. However, it is essential to understand the extent to which these corpora reflect the representative contributions of native speakers, especially when many entries in a given language are directly translated from other languages or automatically generated through automated mechanisms. In this paper, we study the performance implications of using inorganic corpora that are not representative of native speakers and are generated through automated techniques such as bot generation or automated template-based translation. The case of the Arabic Wikipedia editions gives a unique case study of this since the Moroccan Arabic Wikipedia edition (ARY) is small but representative, the Egyptian Arabic Wikipedia edition (ARZ) is large but unrepresentative, and the Modern Standard Arabic Wikipedia edition (AR) is both large and more representative. We intrinsically evaluate the performance of two main NLP upstream tasks, namely word representation and language modeling, using word analogy evaluations and fill-mask evaluations using our two newly created datasets: Arab States Analogy Dataset (ASAD) and Masked Arab States Dataset (MASD). We demonstrate that for good NLP performance, we need both large and organic corpora; neither alone is sufficient. We show that producing large corpora through automated means can be a counter-productive, producing models that both perform worse and lack cultural richness and meaningful representation of the Arabic language and its native speakers.", + "Added By": "Saied Alshahrani" +} \ No newline at end of file diff --git a/datasets/arabic_wikipedia_20230101_nobots.json b/datasets/arabic_wikipedia_20230101_nobots.json new file mode 100644 index 0000000..6f985f5 --- /dev/null +++ b/datasets/arabic_wikipedia_20230101_nobots.json @@ -0,0 +1,36 @@ +{ + "Name": "Arabic_Wikipedia_20230101_nobots", + "Subsets": [], + "HF Link": "https://hf.co/datasets/SaiedAlshahrani/Arabic_Wikipedia_20230101_nobots", + "Link": "https://hf.co/datasets/SaiedAlshahrani/Arabic_Wikipedia_20230101_nobots", + "License": "MIT License", + "Year": 2023, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "wikipedia", + "Form": "text", + "Collection Style": "manual curation", + "Description": "Arabic_Wikipedia_20230101_nobots is a dataset created using the Arabic Wikipedia articles, excluding the bot-generated articles, downloaded on the 1st of January 2023, and processed to train an Arabic RoBERTa model.", + "Volume": "847,000", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "Clarkson University", + "Derived From": "Arabic Wikipedia Dump 2023-01-01", + "Paper Title": "Performance Implications of Using Unrepresentative Corpora in Arabic Natural Language Processing", + "Paper Link": "https://aclanthology.org/2023.arabicnlp-1.19.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "HuggingFace", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "language modeling", + "Venue Title": "ArabicNLP 2023", + "Citations": "nan", + "Venue Type": "conference", + "Venue Name": "The First Arabic Natural Language Processing Conference", + 
"Authors": "Saied Alshahrani, Norah Alshahrani, Soumyabrata Dey, Jeanna Matthews", + "Affiliations": "Clarkson University", + "Abstract": "Wikipedia articles are a widely used source of training data for Natural Language Processing (NLP) research, particularly as corpora for low-resource languages like Arabic. However, it is essential to understand the extent to which these corpora reflect the representative contributions of native speakers, especially when many entries in a given language are directly translated from other languages or automatically generated through automated mechanisms. In this paper, we study the performance implications of using inorganic corpora that are not representative of native speakers and are generated through automated techniques such as bot generation or automated template-based translation. The case of the Arabic Wikipedia editions gives a unique case study of this since the Moroccan Arabic Wikipedia edition (ARY) is small but representative, the Egyptian Arabic Wikipedia edition (ARZ) is large but unrepresentative, and the Modern Standard Arabic Wikipedia edition (AR) is both large and more representative. We intrinsically evaluate the performance of two main NLP upstream tasks, namely word representation and language modeling, using word analogy evaluations and fill-mask evaluations using our two newly created datasets: Arab States Analogy Dataset (ASAD) and Masked Arab States Dataset (MASD). We demonstrate that for good NLP performance, we need both large and organic corpora; neither alone is sufficient. We show that producing large corpora through automated means can be a counter-productive, producing models that both perform worse and lack cultural richness and meaningful representation of the Arabic language and its native speakers.", + "Added By": "Saied Alshahrani" +} \ No newline at end of file diff --git a/datasets/arabic_wikipedia_talk_pages.json b/datasets/arabic_wikipedia_talk_pages.json new file mode 100644 index 0000000..ae183a9 --- /dev/null +++ b/datasets/arabic_wikipedia_talk_pages.json @@ -0,0 +1,36 @@ +{ + "Name": "Arabic Wikipedia Talk Pages", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/wikipedia_talks", + "Link": "https://github.com/michaelmilleryoder/wikipedia-codeswitching-data/", + "License": "unknown", + "Year": 2017, + "Language": "ar", + "Dialect": "mixed", + "Domain": "wikipedia", + "Form": "text", + "Collection Style": "crawling", + "Description": "Comments on Arabic Wikipedia Talk pages that have some code-switching to latin.", + "Volume": "5259", + "Unit": "sentences", + "Ethical Risks": "nan", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "Code-Switching as a Social Act: The Case of Arabic Wikipedia Talk Pages", + "Paper Link": "https://aclanthology.org/W17-2911.pdf", + "Script": "Arab-Latn", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "code switching", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Amr Keleg" +} \ No newline at end of file diff --git a/datasets/arabic_wikireading_and_kaiflematha.json b/datasets/arabic_wikireading_and_kaiflematha.json new file mode 100644 index 0000000..62c4deb --- /dev/null +++ b/datasets/arabic_wikireading_and_kaiflematha.json @@ -0,0 +1,36 @@ +{ + "Name": "Arabic WikiReading and KaifLematha", + "Subsets": [], + "HF Link": "nan", + "Link": 
"https://github.com/esulaiman/Arabic-WikiReading-and-KaifLematha-datasets", + "License": "unknown", + "Year": 2022, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "wikipedia", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "high quality and large-scale Arabic reading comprehension datasets: Arabic WikiReading and KaifLematha with around +100 K instances.", + "Volume": "100,000", + "Unit": "documents", + "Ethical Risks": "Medium", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "https://link.springer.com/content/pdf/10.1007/s10579-022-09577-5.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "reading comprehension", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "journal", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Emad A. Alghamdi" +} \ No newline at end of file diff --git a/datasets/arabicaqa.json b/datasets/arabicaqa.json new file mode 100644 index 0000000..47b95ae --- /dev/null +++ b/datasets/arabicaqa.json @@ -0,0 +1,36 @@ +{ + "Name": "ArabicaQA", + "Subsets": [], + "HF Link": "https://hf.co/datasets/abdoelsayed/Open-ArabicaQA", + "Link": "https://github.com/DataScienceUIBK/ArabicaQA", + "License": "MIT License", + "Year": 2024, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "wikipedia", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "ArabicaQA is a robust dataset designed to support and advance the development of Arabic Question Answering (QA) systems. This dataset encompasses a wide range of question types, including both Machine Reading Comprehension (MRC) and Open-Domain questions, catering to various aspects of QA research and application. The dataset is structured to facilitate training, validation, and testing of Arabic QA models.", + "Volume": "88,946", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "Multiple Institutions ", + "Derived From": "nan", + "Paper Title": "ArabicaQA: A Comprehensive Dataset for Arabic Question", + "Paper Link": "https://arxiv.org/pdf/2403.17848v1.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "question answering", + "Venue Title": "arXiv", + "Citations": "nan", + "Venue Type": "preprint", + "Venue Name": "nan", + "Authors": "Abdelrahman Abdallah , Mahmoud Kasem , Mahmoud Abdalla , Mohamed Mahmoud , Mohamed Elkasaby , Yasser Elbendary , Adam Jatowt\r", + "Affiliations": "nan", + "Abstract": "In this paper, we address the significant gap in Arabic natural\nlanguage processing (NLP) resources by introducing ArabicaQA,\nthe first large-scale dataset for machine reading comprehension and\nopen-domain question answering in Arabic. This comprehensive\ndataset, consisting of 89,095 answerable and 3,701 unanswerable\nquestions created by crowdworkers to look similar to answerable\nones, along with additional labels of open-domain questions marks\na crucial advancement in Arabic NLP resources. We also present\nAraDPR, the first dense passage retrieval model trained on the\nArabic Wikipedia corpus, specifically designed to tackle the unique\nchallenges of Arabic text retrieval. 
Furthermore, our study includes\nextensive benchmarking of large language models (LLMs) for Arabic\nquestion answering, critically evaluating their performance in the\nArabic language context. In conclusion, ArabicaQA, AraDPR, and\nthe benchmarking of LLMs in Arabic question answering offer\nsignificant advancements in the field of Arabic NLP. The dataset\nand code are publicly accessible for further research", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/arabichatespeechdataset.json b/datasets/arabichatespeechdataset.json new file mode 100644 index 0000000..1eb7b92 --- /dev/null +++ b/datasets/arabichatespeechdataset.json @@ -0,0 +1,36 @@ +{ + "Name": "ArabicHateSpeechDataset", + "Subsets": [], + "HF Link": "nan", + "Link": "https://github.com/sbalsefri/ArabicHateSpeechDataset", + "License": "CC BY-SA 4.0", + "Year": 2020, + "Language": "ar", + "Dialect": "mixed", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "The dataset contains 5,361 Arabic tweets annotated with six categories: clean, offensive, and hateful speech (religion-based, ethnicity-based, nationality-based, gender-based). It focuses on Arabic dialects (Gulf and Modern Standard Arabic) and uses a three-level annotation schema.", + "Volume": "5,361", + "Unit": "sentences", + "Ethical Risks": "Medium", + "Provider": "Multiple Institutions", + "Derived From": "nan", + "Paper Title": "Hate and Offensive Speech Detection on Arabic Social Media", + "Paper Link": "https://www.sciencedirect.com/science/article/abs/pii/S2468696420300379?fr=RR-2&ref=pdf_download&rr=8c11aa96ed17794c", + "Script": "Arab", + "Tokenized": "Yes", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "sentiment analysis, offensive language detection, hate speech detection, text classification", + "Venue Title": "OSNEM", + "Citations": "nan", + "Venue Type": "journal", + "Venue Name": "Online Social Networks and Media", + "Authors": "Safa Alsafari, Samira Sadaoui, Malek Mouhoub", + "Affiliations": "University of Regina, University of Jeddah", + "Abstract": "The paper proposes a framework for detecting hate and offensive speech in Arabic using machine learning and deep learning models. The dataset was collected from Twitter, focusing on Gulf Arabic and Modern Standard Arabic dialects, and includes annotations for religion, ethnicity, nationality, and gender-based hate speech.", + "Added By": "Maryam Al Emadi" +} \ No newline at end of file diff --git a/datasets/arabicmmlu.json b/datasets/arabicmmlu.json new file mode 100644 index 0000000..a86e1b2 --- /dev/null +++ b/datasets/arabicmmlu.json @@ -0,0 +1,36 @@ +{ + "Name": "ArabicMMLU", + "Subsets": [], + "HF Link": "https://hf.co/datasets/MBZUAI/ArabicMMLU", + "Link": "https://hf.co/datasets/MBZUAI/ArabicMMLU", + "License": "CC BY-NC 4.0", + "Year": 2024, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "ArabicMMLU is the first multi-task language understanding benchmark for Arabic language, sourced from school exams across diverse educational levels in different countries spanning North Africa, the Levant, and the Gulf regions. 
Our data comprises 40 tasks and 14,575 multiple-choice questions in Modern Standard Arabic (MSA), and is carefully constructed by collaborating with native speakers in the region.", + "Volume": "14,575", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "MBZUAI", + "Derived From": "nan", + "Paper Title": "ArabicMMLU: Assessing Massive Multitask Language Understanding in Arabic", + "Paper Link": "https://arxiv.org/pdf/2402.12840.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "HuggingFace", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "multiple choice question answering", + "Venue Title": "arXiv", + "Citations": "nan", + "Venue Type": "preprint", + "Venue Name": "nan", + "Authors": "Fajri Koto, Haonan Li, Sara Shatnawi, Jad Doughman, Abdelrahman Boda Sadallah, Aisha Alraeesi, Khalid Almubarak, Zaid Alyafeai, Neha Sengupta, Shady Shehata, Nizar Habash, Preslav Nakov, and Timothy Baldwin", + "Affiliations": "nan", + "Abstract": "The focus of language model evaluation has transitioned towards reasoning and knowledge-intensive tasks, driven by advancements in pretraining large models. While state-of-the-art models are partially trained on large Arabic texts, evaluating their performance in Arabic remains challenging due to the limited availability of relevant datasets. To bridge this gap, we present ArabicMMLU, the first multi-task language understanding benchmark for Arabic language, sourced from school exams across diverse educational levels in different countries spanning North Africa, the Levant, and the Gulf regions. Our data comprises 40 tasks and 14,575 multiple-choice questions in Modern Standard Arabic (MSA), and is carefully constructed by collaborating with native speakers in the region. Our comprehensive evaluations of 35 models reveal substantial room for improvement, particularly among the best open-source models. Notably, BLOOMZ, mT0, LLama2, and Falcon struggle to achieve a score of 50%, while even the top-performing Arabic-centric model only achieves a score of 62.3%.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/arabicqa_2_1m.json b/datasets/arabicqa_2_1m.json new file mode 100644 index 0000000..67ed818 --- /dev/null +++ b/datasets/arabicqa_2_1m.json @@ -0,0 +1,36 @@ +{ + "Name": "ArabicQA_2.1M", + "Subsets": [], + "HF Link": "https://hf.co/datasets/riotu-lab/ArabicQA_2.1M", + "Link": "https://hf.co/datasets/riotu-lab/ArabicQA_2.1M", + "License": "Apache-2.0", + "Year": 2024, + "Language": "ar", + "Dialect": "mixed", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "Our dataset is an amalgamation of several filtered datasets; the total number of rows across all datasets was 4,731,600, which was reduced to 2,141,146 rows after filtering. 
The dataset was collected to fine-tune a pretrained model; the model imposed a number of constraints on us, discussed in the following section.", + "Volume": "2,410,000", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "riotu-lab", + "Derived From": "InstAr-500k, Bactrain-X, xquad, xtreme", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "HuggingFace", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "question answering, instruction tuning", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/arabicsa.json b/datasets/arabicsa.json new file mode 100644 index 0000000..df4fc28 --- /dev/null +++ b/datasets/arabicsa.json @@ -0,0 +1,36 @@ +{ + "Name": "ArabicSA", + "Subsets": [], + "HF Link": "nan", + "Link": "https://www.saifmohammad.com/WebPages/ArabicSA.html", + "License": "Non Commercial Use - ELRA END USER", + "Year": 2016, + "Language": "ar", + "Dialect": "mixed", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(translation)", + "Description": "Arabic Emoticon Lexicon: Contains Arabic words and phrases frequently used on Twitter with sentiment annotation based on co-occurrence with positive and negative seed words. Arabic Hashtag Lexicon: Annotated with sentiment based on commonly used Arabic hashtags from Twitter. Arabic Hashtag Lexicon (dialectal): Focuses on dialectal Arabic words. BBN Blog Posts Sentiment Corpus: Contains Levantine dialect social media posts, manually annotated for sentiment. Syria Tweets Sentiment Corpus: A collection of tweets from Syria, manually annotated for sentiment.", + "Volume": "231,155", + "Unit": "tokens", + "Ethical Risks": "Medium", + "Provider": "National Research Council Canada (NRC)", + "Derived From": "Twitter API, BBN Arabic-Dialect\u2013English Parallel Text", + "Paper Title": "Sentiment Lexicons for Arabic Social Media", + "Paper Link": "https://aclanthology.org/L16-1006.pdf", + "Script": "Arab", + "Tokenized": "Yes", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "sentiment analysis, cross-lingual sentiment analysis", + "Venue Title": "LREC (Language Resources and Evaluation Conference), NAACL", + "Citations": "nan", + "Venue Type": "conference", + "Venue Name": "Language Resources and Evaluation Conference (LREC), North American Chapter of the Association for Computational Linguistics (NAACL)", + "Authors": "Saif M. 
Mohammad, Mohammad Salameh, Svetlana Kiritchenko", + "Affiliations": "National Research Council Canada (NRC), University of Alberta", + "Abstract": "nan", + "Added By": "Maryam Al Emadi" +} \ No newline at end of file diff --git a/datasets/arabicsenamticsimilaritydataset.json b/datasets/arabicsenamticsimilaritydataset.json new file mode 100644 index 0000000..85423c0 --- /dev/null +++ b/datasets/arabicsenamticsimilaritydataset.json @@ -0,0 +1,36 @@ +{ + "Name": "ArabicSenamticSimilarityDataset", + "Subsets": [], + "HF Link": "https://hf.co/datasets/ArimaKn/ArabicSemanticSimilairtyDataset", + "Link": "https://hf.co/datasets/ArimaKn/ArabicSemanticSimilairtyDataset", + "License": "unknown", + "Year": 2023, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "manual curation", + "Description": "A simple dataset for the semantic similarity task with 3 different judges", + "Volume": "778", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "Birzeit University", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "HuggingFace", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "machine translation, sentiment analysis, topic classification, semantic similarity", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "Mahmoud Nobani", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Mahmoud Nobani" +} \ No newline at end of file diff --git a/datasets/arabicweb16.json b/datasets/arabicweb16.json new file mode 100644 index 0000000..4a05fe1 --- /dev/null +++ b/datasets/arabicweb16.json @@ -0,0 +1,36 @@ +{ + "Name": "ArabicWeb16", + "Subsets": [], + "HF Link": "nan", + "Link": "https://sites.google.com/view/arabicweb16/", + "License": "CC BY 3.0", + "Year": 2016, + "Language": "ar", + "Dialect": "mixed", + "Domain": "other", + "Form": "text", + "Collection Style": "crawling", + "Description": "A public Web crawl of 150,211,934 Arabic Web pages with high coverage of dialectal Arabic as well as Modern Standard Arabic (MSA)", + "Volume": "150,211,934", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "Qatar University", + "Derived From": "nan", + "Paper Title": "ArabicWeb16: A New Crawl for Today\u2019s Arabic Web", + "Paper Link": "https://www.ischool.utexas.edu/~ml/papers/sigir16-arabicweb.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "google sites", + "Access": "Upon-Request", + "Cost": "nan", + "Test Split": "No", + "Tasks": "text generation, language modeling", + "Venue Title": "SIGIR", + "Citations": "12.0", + "Venue Type": "conference", + "Venue Name": "ACM SIGIR Conference on Research and Development in Information Retrieval", + "Authors": "Reem Suwaileh, Mucahid Kutlu, Nihal Fathima, T. Elsayed, Matthew Lease", + "Affiliations": ",TOBB University of Economics and Technology,,,", + "Abstract": "Web crawls provide valuable snapshots of the Web which enable a wide variety of research, be it distributional analysis to characterize Web properties or use of language, content analysis in social science, or Information Retrieval (IR) research to develop and evaluate effective search algorithms. While many English-centric Web crawls exist, existing public Arabic Web crawls are quite limited, limiting research and development. 
To remedy this, we present ArabicWeb16, a new public Web crawl of roughly 150M Arabic Web pages with significant coverage of dialectal Arabic as well as Modern Standard Arabic. For IR researchers, we expect ArabicWeb16 to support various research areas: ad-hoc search, question answering, filtering, cross-dialect search, dialect detection, entity search, blog search, and spam detection. Combined use with a separate Arabic Twitter dataset we are also collecting may provide further value.", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/arabicweb24.json b/datasets/arabicweb24.json new file mode 100644 index 0000000..4ce03dd --- /dev/null +++ b/datasets/arabicweb24.json @@ -0,0 +1,36 @@ +{ + "Name": "ArabicWeb24", + "Subsets": [], + "HF Link": "https://hf.co/datasets/lightonai/ArabicWeb24", + "Link": "https://hf.co/datasets/lightonai/ArabicWeb24", + "License": "odc-by", + "Year": 2024, + "Language": "ar", + "Dialect": "mixed", + "Domain": "web pages", + "Form": "text", + "Collection Style": "crawling", + "Description": "The ArabicWeb24 dataset consists of more than 28 billion tokens of cleaned and deduplicated Arabic web data from a customized crawl. This was processed using the large-scale data processing library datatrove.", + "Volume": "28,000,000,000", + "Unit": "tokens", + "Ethical Risks": "High", + "Provider": "lightonai", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "HuggingFace", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "language modeling", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/arablex__database_of_arabic_general_vocabulary_(dag).json b/datasets/arablex__database_of_arabic_general_vocabulary_(dag).json new file mode 100644 index 0000000..f80ab6b --- /dev/null +++ b/datasets/arablex__database_of_arabic_general_vocabulary_(dag).json @@ -0,0 +1,36 @@ +{ + "Name": "ArabLEX: Database of Arabic General Vocabulary (DAG)", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalogue.elra.info/en-us/repository/browse/ELRA-L0131/", + "License": "Non Commercial Use - ELRA END USER", + "Year": 2019, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "A comprehensive full-form lexicon of Arabic general vocabulary including all inflected, conjugated and cliticized forms. Each entry is accompanied by a rich set of morphological, grammatical, and phonological attributes. 
Ideally suited for NLP applications, DAG provides precise phonemic transcriptions and full vowel diacritics designed to enhance Arabic speech technology.", + "Volume": "87,930,738", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "ELRA ", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "ELRA", + "Access": "With-Fee", + "Cost": "42,000.00\u20ac", + "Test Split": "No", + "Tasks": "morphological analysis, phonological analysis, grammatical analysis", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/arabscribe.json b/datasets/arabscribe.json new file mode 100644 index 0000000..d09ecb4 --- /dev/null +++ b/datasets/arabscribe.json @@ -0,0 +1,36 @@ +{ + "Name": "ArabScribe", + "Subsets": [], + "HF Link": "nan", + "Link": "https://camel.abudhabi.nyu.edu/arabscribe/", + "License": "custom", + "Year": 2017, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "transcribed audio", + "Form": "spoken", + "Collection Style": "other", + "Description": "The ArabScribe dataset contains 10,000 transcriptions of Arabic words with both Roman and Arabic keyboards based on audio impressions of native and non-native speakers of Arabic.", + "Volume": "3,234", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "NYU Abu Dhabi", + "Derived From": "nan", + "Paper Title": "Robust Dictionary Lookup in Multiple Noisy Orthographies", + "Paper Link": "https://aclanthology.org/W17-1315.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "CAMeL Resources", + "Access": "Upon-Request", + "Cost": "nan", + "Test Split": "No", + "Tasks": "dictionary", + "Venue Title": "WANLP", + "Citations": "nan", + "Venue Type": "workshop", + "Venue Name": "Arabic Natural Language Processing Workshop", + "Authors": "Lingliang Zhang, Nizar Habash and Godfried Toussaint", + "Affiliations": "NYU", + "Abstract": "We present the MultiScript Phonetic\nSearch algorithm to address the problem\nof language learners looking up unfamiliar words that they heard. We apply it\nto Arabic dictionary lookup with noisy\nqueries done using both the Arabic and\nRoman scripts. Our algorithm is based on\na computational phonetic distance metric\nthat can be optionally machine learned. To\nbenchmark our performance, we created\nthe ArabScribe dataset, containing 10,000\nnoisy transcriptions of random Arabic dictionary words. 
Our algorithm outperforms\nGoogle Translate\u2019s \u201cdid you mean\u201d feature, as well as the Yamli smart Arabic\nkeyboard.\n", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/aracovid19-mfh.json b/datasets/aracovid19-mfh.json new file mode 100644 index 0000000..5d52a59 --- /dev/null +++ b/datasets/aracovid19-mfh.json @@ -0,0 +1,36 @@ +{ + "Name": "AraCOVID19-MFH", + "Subsets": [], + "HF Link": "nan", + "Link": "https://github.com/MohamedHadjAmeur/AraCOVID19-MFH", + "License": "CC BY-NC-SA 4.0", + "Year": 2021, + "Language": "ar", + "Dialect": "mixed", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "A multi-label fake news and hate speech detection dataset; each sentence is annotated with 10 labels", + "Volume": "10,828", + "Unit": "sentences", + "Ethical Risks": "High", + "Provider": "Multiple Institutions", + "Derived From": "nan", + "Paper Title": "AraCOVID19-MFH: Arabic COVID-19 Multi-label Fake News and Hate Speech Detection Dataset", + "Paper Link": "https://arxiv.org/abs/2105.03143", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Upon-Request", + "Cost": "nan", + "Test Split": "No", + "Tasks": "fake news detection, hate speech detection", + "Venue Title": "ArXiv", + "Citations": "0.0", + "Venue Type": "preprint", + "Venue Name": "ArXiv", + "Authors": "Mohamed Seghir Hadj Ameur, H. Aliane", + "Affiliations": ",", + "Abstract": "Along with the COVID-19 pandemic, an \"infodemic\" of false and misleading information has emerged and has complicated the COVID-19 response efforts. Social networking sites such as Facebook and Twitter have contributed largely to the spread of rumors, conspiracy theories, hate, xenophobia, racism, and prejudice. To combat the spread of fake news, researchers around the world have and are still making considerable efforts to build and share COVID-19 related research articles, models, and datasets. This paper releases \"AraCOVID19-MFH\", a manually annotated multi-label Arabic COVID-19 fake news and hate speech detection dataset. Our dataset contains 10,828 Arabic tweets annotated with 10 different labels. The labels have been designed to consider some aspects relevant to the fact-checking task, such as the tweet's check worthiness, positivity/negativity, and factuality. To confirm our annotated dataset's practical utility, we used it to train and evaluate several classification models and reported the obtained results. Though the dataset is mainly designed for fake news detection, it can also be used for hate speech detection, opinion/news classification, dialect identification, and many other tasks. \u00a9 2021 Elsevier B.V. 
All rights reserved.", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/aracovid19-ssd__arabic_covid-19_sentiment_and_sarcasm_detection_dataset.json b/datasets/aracovid19-ssd__arabic_covid-19_sentiment_and_sarcasm_detection_dataset.json new file mode 100644 index 0000000..70a34c0 --- /dev/null +++ b/datasets/aracovid19-ssd__arabic_covid-19_sentiment_and_sarcasm_detection_dataset.json @@ -0,0 +1,36 @@ +{ + "Name": "AraCovid19-SSD: Arabic COVID-19 Sentiment and Sarcasm Detection Dataset", + "Subsets": [], + "HF Link": "nan", + "Link": "https://github.com/MohamedHadjAmeur/AraCovid19-SSD", + "License": "CC BY-NC-SA 4.0", + "Year": 2021, + "Language": "ar", + "Dialect": "mixed", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "AraCovid19-SSD is a manually annotated Arabic COVID-19 sarcasm and sentiment detection dataset containing 5,162 tweets.", + "Volume": "5,162", + "Unit": "sentences", + "Ethical Risks": "High", + "Provider": "Research Centre on Scientific and Technical Information (CERIST)", + "Derived From": "nan", + "Paper Title": "ARACOVID19-SSD: ARABIC COVID-19 SENTIMENT AND SARCASM DETECTION DATASET", + "Paper Link": "https://arxiv.org/pdf/2110.01948v1.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Upon-Request", + "Cost": "nan", + "Test Split": "No", + "Tasks": "sarcasm detection, sentiment detection", + "Venue Title": "arXiv", + "Citations": "nan", + "Venue Type": "preprint", + "Venue Name": "nan", + "Authors": "Mohamed Seghir Hadj Ameur, Hassina Aliane", + "Affiliations": "Research Centre on Scientific and Technical Information (CERIST)", + "Abstract": "Coronavirus disease (COVID-19) is an infectious respiratory disease that was first discovered in late December 2019, in Wuhan, China, and then spread worldwide causing a lot of panic and death. Users of social networking sites such as Facebook and Twitter have been focused on reading, publishing, and sharing novelties, tweets, and articles regarding the newly emerging pandemic. A lot of these users often employ sarcasm to convey their intended meaning in a humorous, funny, and indirect way making it hard for computer-based applications to automatically understand and identify their goal and the harm level that they can inflect. Motivated by the emerging need for annotated datasets that tackle these kinds of problems in the context of COVID-19, this paper builds and releases AraCOVID19-SSD1 a manually annotated Arabic COVID-19 sarcasm and sentiment detection dataset containing 5,162 tweets. 
To confirm the practical utility of the built dataset, it has been carefully analyzed and tested using several classification models.", + "Added By": "Abdelrahman Kaseb" +} \ No newline at end of file diff --git a/datasets/aracust.json b/datasets/aracust.json new file mode 100644 index 0000000..4616756 --- /dev/null +++ b/datasets/aracust.json @@ -0,0 +1,36 @@ +{ + "Name": "AraCust", + "Subsets": [], + "HF Link": "nan", + "Link": "https://peerj.com/articles/cs-510/#supplemental-information", + "License": "unknown", + "Year": 2021, + "Language": "ar", + "Dialect": "ar-SA: (Arabic (Saudi Arabia))", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "Saudi Telecom Tweets corpus for sentiment analysis", + "Volume": "20,000", + "Unit": "sentences", + "Ethical Risks": "Medium", + "Provider": "Durham University, Princess Nourah bint Abdulrahman University", + "Derived From": "nan", + "Paper Title": "AraCust: a Saudi Telecom Tweets corpus for sentiment analysis", + "Paper Link": "https://peerj.com/articles/cs-510/#supplemental-information", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Upon-Request", + "Cost": "nan", + "Test Split": "No", + "Tasks": "sentiment analysis", + "Venue Title": "PeerJ Comput. Sci.", + "Citations": "0.0", + "Venue Type": "journal", + "Venue Name": "The open access journal for computer science", + "Authors": "Latifah Almuqren, A. Cristea", + "Affiliations": ",", + "Abstract": "Comparing Arabic to other languages, Arabic lacks large corpora for Natural Language Processing (Assiri, Emam & Al-Dossari, 2018; Gamal et al., 2019). A number of scholars depended on translation from one language to another to construct their corpus (Rushdi-Saleh et al., 2011). This paper presents how we have constructed, cleaned, pre-processed, and annotated our 20,000-tweet Gold Standard Corpus (GSC) AraCust, the first Telecom GSC for Arabic Sentiment Analysis (ASA) for Dialectal Arabic (DA). AraCust contains Saudi dialect tweets, processed from a self-collected Arabic tweets dataset and has been annotated for sentiment analysis, i.e., manually labelled (k=0.60). In addition, we have illustrated AraCust\u2019s power, by performing an exploratory data analysis, to analyse the features that were sourced from the nature of our corpus, to assist with choosing the right ASA methods for it. To evaluate our Golden Standard corpus AraCust, we have first applied a simple experiment, using a supervised classifier, to offer benchmark outcomes for forthcoming works. In addition, we have applied the same supervised classifier on a publicly available Arabic dataset created from Twitter, ASTD (Nabil, Aly & Atiya, 2015). The result shows that our dataset AraCust outperforms the ASTD result with 91% accuracy and 89% F1avg score. 
The AraCust corpus will be released, together with code useful for its exploration, via GitHub as a part of this submission.", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/aradata.json b/datasets/aradata.json new file mode 100644 index 0000000..c0165d0 --- /dev/null +++ b/datasets/aradata.json @@ -0,0 +1,79 @@ +{ + "Name": "araData", + "Subsets": [ + { + "Name": "GLF", + "Dialect": "ar-GLF: (Arabic (Gulf))", + "Volume": "2,007", + "Unit": "nan" + }, + { + "Name": "General", + "Dialect": "mixed", + "Volume": "2,003", + "Unit": "nan" + }, + { + "Name": "EGY", + "Dialect": "ar-EG: (Arabic (Egypt))", + "Volume": "2,002", + "Unit": "nan" + }, + { + "Name": "LEV", + "Dialect": "ar-LEV: (Arabic (Levant))", + "Volume": "2,001", + "Unit": "nan" + }, + { + "Name": "MGH", + "Dialect": "ar-NOR: (Arabic (North Africa))", + "Volume": "2,001", + "Unit": "nan" + }, + { + "Name": "Tunisian", + "Dialect": "ar-TN: (Arabic (Tunisia))", + "Volume": "2,001", + "Unit": "nan" + }, + { + "Name": "IRQ", + "Dialect": "ar-IQ: (Arabic (Iraq))", + "Volume": "216", + "Unit": "nan" + } + ], + "HF Link": "https://hf.co/datasets/arbml/arData", + "Link": "https://github.com/malek-hedhli/araData", + "License": "unknown", + "Year": 2022, + "Language": "ar", + "Dialect": "mixed", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "araData is a clean and balanced dataset containing sentences for 7 Arabic dialects, distributed as follows: 2,007 GLF, 2,003 general, 2,002 EGY, 2,001 LEV, 2,001 MGH, 2,001 Tunisian, and 216 IRQ sentences. This data is a collection of different resources prepared by Malek Hedhli for a project to identify Arabic dialects.", + "Volume": "12,236", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "dialect identification", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/arafacts.json b/datasets/arafacts.json new file mode 100644 index 0000000..8428f0f --- /dev/null +++ b/datasets/arafacts.json @@ -0,0 +1,36 @@ +{ + "Name": "AraFacts", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/AraFacts", + "Link": "https://gitlab.com/bigirqu/AraFacts/", + "License": "CC BY-NC 4.0", + "Year": 2021, + "Language": "ar", + "Dialect": "mixed", + "Domain": "other", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "consists of 6,222 claims along with their factual labels and additional metadata, such as fact-checking article content, topical category, and links to posts or Web pages spreading the claim", + "Volume": "6,222", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "Qatar University", + "Derived From": "nan", + "Paper Title": "AraFacts: The First Large Arabic Dataset of Naturally Occurring Claims", + "Paper Link": "https://aclanthology.org/2021.wanlp-1.26.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitLab", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "topic classification, claim verification", + "Venue Title": "WANLP", + "Citations": "0.0", + "Venue Type": "workshop", + "Venue Name": "Arabic Natural Language Processing Workshop", + "Authors": 
"Zien Sheikh Ali,Watheq Mansour,Tamer Elsayed,A. Al-Ali", + "Affiliations": ",,,", + "Abstract": "We introduce AraFacts, the first large Arabic dataset of naturally occurring claims collected from 5 Arabic fact-checking websites, e.g., Fatabyyano and Misbar, and covering claims since 2016. Our dataset consists of 6,121 claims along with their factual labels and additional metadata, such as fact-checking article content, topical category, and links to posts or Web pages spreading the claim. Since the data is obtained from various fact-checking websites, we standardize the original claim labels to provide a unified label rating for all claims. Moreover, we provide revealing dataset statistics and motivate its use by suggesting possible research applications. The dataset is made publicly available for the research community.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/aranews.json b/datasets/aranews.json new file mode 100644 index 0000000..8785771 --- /dev/null +++ b/datasets/aranews.json @@ -0,0 +1,36 @@ +{ + "Name": "AraNews", + "Subsets": [], + "HF Link": "nan", + "Link": "https://github.com/UBC-NLP/wanlp2020_arabic_fake_news_detection", + "License": "custom", + "Year": 2020, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "a large-scale, multi-topic, and multi-country Arabic news dataset", + "Volume": "1,000,000", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "UBC", + "Derived From": "nan", + "Paper Title": "Machine Generation and Detection of Arabic Manipulated and Fake News\r", + "Paper Link": "https://aclanthology.org/2020.wanlp-1.7.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Upon-Request", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "fake news detection", + "Venue Title": "WANLP", + "Citations": "6.0", + "Venue Type": "workshop", + "Venue Name": "Arabic Natural Language Processing Workshop", + "Authors": "E. Nagoudi,AbdelRahim Elmadany,Muhammad Abdul-Mageed,Tariq Alhindi,H. Cavusoglu", + "Affiliations": ",University of British Columbia,,Columbia University;King Abdulaziz City for Science and Technology,", + "Abstract": "Fake news and deceptive machine-generated text are serious problems threatening modern societies, including in the Arab world. This motivates work on detecting false and manipulated stories online. However, a bottleneck for this research is lack of sufficient data to train detection models. We present a novel method for automatically generating Arabic manipulated (and potentially fake) news stories. Our method is simple and only depends on availability of true stories, which are abundant online, and a part of speech tagger (POS). To facilitate future work, we dispense with both of these requirements altogether by providing AraNews, a novel and large POS-tagged news dataset that can be used off-the-shelf. Using stories generated based on AraNews, we carry out a human annotation study that casts light on the effects of machine manipulation on text veracity. The study also measures human ability to detect Arabic machine manipulated text generated by our method. Finally, we develop the first models for detecting manipulated Arabic news and achieve state-of-the-art results on Arabic fake news detection (macro F1=70.06). 
Our models and data are publicly available.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/aranpcc.json b/datasets/aranpcc.json new file mode 100644 index 0000000..f6eaae9 --- /dev/null +++ b/datasets/aranpcc.json @@ -0,0 +1,36 @@ +{ + "Name": "AraNPCC", + "Subsets": [], + "HF Link": "nan", + "Link": "https://archive.org/details/AraNPCC", + "License": "CC BY-NC 4.0", + "Year": 2022, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "crawling", + "Description": "the corpus is a large Arabic newspaper COVID-19 corpus, automatically collected from 88 Arabic newspapers from 12 Arab countries.", + "Volume": "7,277,525", + "Unit": "documents", + "Ethical Risks": "Medium", + "Provider": "KACST", + "Derived From": "nan", + "Paper Title": "AraNPCC: The Arabic Newspaper COVID-19 Corpus", + "Paper Link": "http://www.lrec-conf.org/proceedings/lrec2022/workshops/OSACT/pdf/2022.osact-1.4.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "language modeling, topic classification, corpus spatio-temporal analysis", + "Venue Title": "LREC", + "Citations": "nan", + "Venue Type": "conference", + "Venue Name": "European Language Resources Association (ELRA)", + "Authors": "Abdulmohsen Al-Thubaity, Sakhar Alkhereyf, Alia Bahanshal", + "Affiliations": "The National Center for Data Analytics and Artificial Intelligence King Abdulaziz City for Science and Technology (KACST)", + "Abstract": "This paper introduces a corpus for Arabic newspapers during COVID-19: AraNPCC. The AraNPCC corpus covers 2019 until 2021 via automatically-collected data from 12 Arab countries. It comprises more than 2 billion words and 7.2 million texts alongside their metadata. AraNPCC can be used for several natural language processing tasks, such as updating available Arabic language models or corpus linguistics tasks, including language change over time. We utilized the corpus in two case studies. In the first case study, we investigate the correlation between the number of officially reported infected cases and the collective word frequency of \u201cCOVID\u201d and \u201cCorona.\u201d The data shows a positive correlation that varies among Arab countries. For the second case study, we extract and compare the top 50 keywords in 2020 and 2021 to study the impact of the COVID-19 pandemic on two Arab countries, namely Algeria and Saudi Arabia. For 2020, the data shows that the two countries\u2019 newspapers strongly interacted with the pandemic, emphasizing its spread and dangerousness, and in 2021 the data suggests that the two countries coped with the pandemic.", + "Added By": "Jezia Zakraoui" +} \ No newline at end of file diff --git a/datasets/arap-tweet_corpus.json b/datasets/arap-tweet_corpus.json new file mode 100644 index 0000000..2cf7dab --- /dev/null +++ b/datasets/arap-tweet_corpus.json @@ -0,0 +1,36 @@ +{ + "Name": "Arap-Tweet Corpus", + "Subsets": [], + "HF Link": "nan", + "Link": "https://arap.qatar.cmu.edu/", + "License": "unknown", + "Year": 2018, + "Language": "ar", + "Dialect": "mixed", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "Arap-Tweet is a large-scale, multi-dialectal Arabic Twitter corpus containing 2.4 million tweets from 11 regions across 16 countries in the Arab world. 
The dataset includes annotations for dialect, age group, and gender of the users. Tweets were collected using region-specific keywords to ensure accurate dialect identification, and the profiles of users were manually verified and annotated by trained annotators. This corpus is designed to support research in author profiling, stylometry, and dialect identification, providing a valuable resource for natural language processing applications in Arabic.", + "Volume": "2,400,000", + "Unit": "sentences", + "Ethical Risks": "Medium", + "Provider": "Multiple Institutions", + "Derived From": "Public Twitter profiles and their tweets", + "Paper Title": "Arap-Tweet: A Large Multi-Dialect Twitter Corpus for Gender, Age, and Language Variety Identification", + "Paper Link": "https://arxiv.org/pdf/1808.07674", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Upon-Request", + "Cost": "nan", + "Test Split": "No", + "Tasks": "sentiment analysis, dialect identification, author profiling, authorship attribution", + "Venue Title": "COLING", + "Citations": "nan", + "Venue Type": "conference", + "Venue Name": "International Conference on Computational Linguistics", + "Authors": "Wajdi Zaghouani and Anis Charfi", + "Affiliations": "College of Humanities and Social Sciences, Hamad Bin Khalifa University, Qatar; Information Systems Program, Carnegie Mellon University, Qatar", + "Abstract": "This paper presents the Arap-Tweet corpus, a large-scale, multi-dialectal Arabic corpus sourced from Twitter, which has been annotated for age, gender, and dialectal variety. The corpus is intended to provide resources for developing NLP tools and models for Arabic dialects and can be used in tasks such as author profiling, sentiment analysis, and more. The dataset covers 11 major dialect regions and includes over 2.4 million tweets. 
Annotators identified users based on dialect-specific keywords and verified additional metadata (age and gender) through manual checks and external resources.", + "Added By": "Maryam Al Emadi" +} \ No newline at end of file diff --git a/datasets/arasencorpus.json b/datasets/arasencorpus.json new file mode 100644 index 0000000..ee8dc4b --- /dev/null +++ b/datasets/arasencorpus.json @@ -0,0 +1,36 @@ +{ + "Name": "AraSenCorpus", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/AraSenCorpus", + "Link": "https://github.com/yemen2016/AraSenCorpus", + "License": "MIT License", + "Year": 2021, + "Language": "ar", + "Dialect": "mixed", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "contains 4.5 million tweets and covers both modern standard Arabic and some of the Arabic dialects", + "Volume": "4,500,000", + "Unit": "sentences", + "Ethical Risks": "Medium", + "Provider": "Multiple Institutions", + "Derived From": "nan", + "Paper Title": "AraSenCorpus: A Semi-Supervised Approach for Sentiment Annotation of a Large Arabic Text Corpus", + "Paper Link": "https://github.com/yemen2016/AraSenCorpus", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "sentiment analysis", + "Venue Title": "applsci", + "Citations": "2.0", + "Venue Type": "journal", + "Venue Name": "Applied Sciences Journal", + "Authors": "Ali Al-Laith,Muhammad Shahbaz,Hind Alaskar,Asim Rehmat", + "Affiliations": ",,,", + "Abstract": "At a time when research in the field of sentiment analysis tends to study advanced topics in languages, such as English, other languages such as Arabic still suffer from basic problems and challenges, most notably the availability of large corpora. Furthermore, manual annotation is time-consuming and difficult when the corpus is too large. This paper presents a semi-supervised self-learning technique, to extend an Arabic sentiment annotated corpus with unlabeled data, named AraSenCorpus. We use a neural network to train a set of models on a manually labeled dataset containing 15,000 tweets. We used these models to extend the corpus to a large Arabic sentiment corpus called \u201cAraSenCorpus\u201d. AraSenCorpus contains 4.5 million tweets and covers both modern standard Arabic and some of the Arabic dialects. The long-short term memory (LSTM) deep learning classifier is used to train and test the final corpus. We evaluate our proposed framework on two external benchmark datasets to ensure the improvement of the Arabic sentiment classification. 
The experimental results show that our corpus outperforms the existing state-of-the-art systems.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/arasenti.json b/datasets/arasenti.json new file mode 100644 index 0000000..fd9043a --- /dev/null +++ b/datasets/arasenti.json @@ -0,0 +1,36 @@ +{ + "Name": "AraSenti", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/AraSenti", + "Link": "https://github.com/nora-twairesh/AraSenti", + "License": "GPL-3.0", + "Year": 2017, + "Language": "ar", + "Dialect": "ar-SA: (Arabic (Saudi Arabia))", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "The corpus contains 17,573 tweets labelled with four labels for sentiment: positive, negative, neutral and mixed", + "Volume": "17,573", + "Unit": "sentences", + "Ethical Risks": "Medium", + "Provider": "KSU", + "Derived From": "nan", + "Paper Title": "AraSenTi-Tweet: A Corpus for Arabic Sentiment Analysis of Saudi Tweets", + "Paper Link": "https://www.sciencedirect.com/science/article/pii/S1877050917321518", + "Script": "Arab-Latn", + "Tokenized": "Yes", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "sentiment analysis", + "Venue Title": "ACLING", + "Citations": "56.0", + "Venue Type": "conference", + "Venue Name": "International Conference on AI in Computational Linguistics", + "Authors": "Nora Al-Twairesh,Hend Suliman Al-Khalifa,A. Al-Salman,Y. Al-Ohali", + "Affiliations": ",,,", + "Abstract": "Arabic Sentiment Analysis is an active research area these days. However, the Arabic language still lacks sufficient language resources to enable the tasks of sentiment analysis. In this paper, we present the details of collecting and constructing a large dataset of Arabic tweets. The techniques used in cleaning and pre-processing the collected dataset are explained. A corpus of Arabic tweets annotated for sentiment analysis was extracted from this dataset. The corpus consists mainly of tweets written in Modern Standard Arabic and the Saudi dialect. The corpus was manually annotated for sentiment. The annotation process is explained in detail and the challenges during the annotation are highlighted. The corpus contains 17,573 tweets labelled with four labels for sentiment: positive, negative, neutral and mixed. Baseline experiments were conducted to provide benchmark results for future work.", + "Added By": "Maraim Masoud" +} \ No newline at end of file diff --git a/datasets/araspider.json b/datasets/araspider.json new file mode 100644 index 0000000..1e23ec7 --- /dev/null +++ b/datasets/araspider.json @@ -0,0 +1,36 @@ +{ + "Name": "ARASPIDER", + "Subsets": [], + "HF Link": "nan", + "Link": "https://github.com/ahmedheakl/AraSpider", + "License": "MIT License", + "Year": 2024, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "AraSpider is a translated version of the Spider dataset, which is commonly used for semantic parsing and text-to-SQL generation. The dataset includes 200 databases across 138 domains with 10,181 questions and 5,693 unique complex SQL queries. 
This study assesses multilingual models for Arabic text-to-SQL tasks and uses techniques like back translation and prompt engineering to improve the models' accuracy in handling Arabic inputs.", + "Volume": "10,181", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "Multiple Institutions", + "Derived From": "Spider dataset (originally in English)", + "Paper Title": "ARASPIDER: Democratizing Arabic-to-SQL", + "Paper Link": "https://arxiv.org/pdf/2402.07448", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "semantic parsing, SQL generation from natural language inputs, text-to-SQL conversion", + "Venue Title": "arXiv", + "Citations": "nan", + "Venue Type": "preprint", + "Venue Name": "nan", + "Authors": "Ahmed Heakl, Youssef Mohamed, Ahmed B. Zaky", + "Affiliations": "Egypt-Japan University of Science and Technology (E-JUST)", + "Abstract": "The AraSpider dataset is an Arabic version of the Spider dataset, designed for text-to-SQL tasks in natural language processing. Four multilingual translation models were evaluated for translating the Spider dataset from English to Arabic, and two models were tested for their ability to generate SQL queries from Arabic text. The study highlights the effectiveness of back translation strategies, and proposes methodologies for democratizing NLP resources and enhancing collaboration within the Arabic-speaking research community.", + "Added By": "Maryam Al Emadi" +} \ No newline at end of file diff --git a/datasets/arastance.json b/datasets/arastance.json new file mode 100644 index 0000000..5e7e6bb --- /dev/null +++ b/datasets/arastance.json @@ -0,0 +1,36 @@ +{ + "Name": "AraStance", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/arastance", + "Link": "https://github.com/Tariq60/arastance", + "License": "unknown", + "Year": 2021, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "covers false and true claims from multiple domains (e.g., politics, sports, health) and several Arab countries", + "Volume": "4,063", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "Multiple Institutions", + "Derived From": "nan", + "Paper Title": "AraStance: A Multi-Country and Multi-Domain Dataset of Arabic Stance Detection for Fact Checking", + "Paper Link": "https://arxiv.org/pdf/2104.13559.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "stance detection", + "Venue Title": "NLP4IF", + "Citations": "0.0", + "Venue Type": "workshop", + "Venue Name": "NLP for Internet Freedom", + "Authors": "Tariq Alhindi,Amal Alabdulkarim,A. Alshehri,Muhammad Abdul-Mageed,Preslav Nakov", + "Affiliations": "Columbia University;King Abdulaziz City for Science and Technology,,,,", + "Abstract": "With the continuing spread of misinformation and disinformation online, it is of increasing importance to develop combating mechanisms at scale in the form of automated systems that support multiple languages. One task of interest is claim veracity prediction, which can be addressed using stance detection with respect to relevant documents retrieved online. 
To this end, we present our new Arabic Stance Detection dataset (AraStance) of 4,063 claim\u2013article pairs from a diverse set of sources comprising three fact-checking websites and one news website. AraStance covers false and true claims from multiple domains (e.g., politics, sports, health) and several Arab countries, and it is well-balanced between related and unrelated documents with respect to the claims. We benchmark AraStance, along with two other stance detection datasets, using a number of BERT-based models. Our best model achieves an accuracy of 85% and a macro F1 score of 78%, which leaves room for improvement and reflects the challenging nature of AraStance and the task of stance detection in general.", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/arasum_corpus.json b/datasets/arasum_corpus.json new file mode 100644 index 0000000..8f3ed3c --- /dev/null +++ b/datasets/arasum_corpus.json @@ -0,0 +1,36 @@ +{ + "Name": "AraSum Corpus", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/AraSum", + "Link": "https://github.com/ppke-nlpg/AraSum", + "License": "unknown", + "Year": 2022, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "crawling", + "Description": "AraSum corpus is a monolingual corpus for abstractive text summarization for the Arabic language. The AraSum corpus contains 49,604 articles and their corresponding leads. AraSum files are in .csv format.", + "Volume": "49,604", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "P\u00e1zm\u00e1ny P\u00e9ter Catholic University Faculty of Information Technology and Bionics", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "summarization", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "Mram Kahla, Zijian Gy\u0151z\u0151 Yang, Attila Nov\u00e1k", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Mram Kahla" +} \ No newline at end of file diff --git a/datasets/arbanking77.json b/datasets/arbanking77.json new file mode 100644 index 0000000..b2a1861 --- /dev/null +++ b/datasets/arbanking77.json @@ -0,0 +1,36 @@ +{ + "Name": "ArBanking77", + "Subsets": [], + "HF Link": "nan", + "Link": "https://sina.birzeit.edu/arbanking77/", + "License": "unknown", + "Year": 2023, + "Language": "ar", + "Dialect": "mixed", + "Domain": "other", + "Form": "text", + "Collection Style": "human translation", + "Description": "Queries in MSA and Palestinian Arabic annotated for their intent (77 classes)", + "Volume": "31,404", + "Unit": "sentences", + "Ethical Risks": "Medium", + "Provider": "SinaLab, Birzeit University", + "Derived From": "Banking77", + "Paper Title": "ArBanking77: Intent Detection Neural Model and a New Dataset in Modern and Dialectical Arabic", + "Paper Link": "https://aclanthology.org/2023.arabicnlp-1.22/", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Upon-Request", + "Cost": "nan", + "Test Split": "No", + "Tasks": "dialect identification, intent classification", + "Venue Title": "ArabicNLP", + "Citations": "nan", + "Venue Type": "conference", + "Venue Name": "Arabic Natural Language Processing Conference", + "Authors": "Mustafa Jarrar, Ahmet Birim, Mohammed Khalilia, Mustafa Erden, Sana Ghanem", + "Affiliations": "nan", + "Abstract": "This paper 
presents the ArBanking77, a large Arabic dataset for intent detection in the banking domain. Our dataset was arabized and localized from the original English Banking77 dataset, which consists of 13,083 queries to ArBanking77 dataset with 31,404 queries in both Modern Standard Arabic (MSA) and Palestinian dialect, with each query classified into one of the 77 classes (intents). Furthermore, we present a neural model, based on AraBERT, fine-tuned on ArBanking77, which achieved an F1-score of 0.9209 and 0.8995 on MSA and Palestinian dialect, respectively. We performed extensive experimentation in which we simulated low-resource settings, where the model is trained on a subset of the data and augmented with noisy queries to simulate colloquial terms, mistakes and misspellings found in real NLP systems, especially live chat queries. The data and the models are publicly available at https://sina.birzeit.edu/arbanking77.", + "Added By": "Amr Keleg" +} \ No newline at end of file diff --git a/datasets/arc-wmi.json b/datasets/arc-wmi.json new file mode 100644 index 0000000..f94ac3d --- /dev/null +++ b/datasets/arc-wmi.json @@ -0,0 +1,36 @@ +{ + "Name": "ARC-WMI", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/ARC_WMI", + "Link": "https://github.com/iwan-rg/ARC-WMI", + "License": "CC BY-NC-SA 4.0", + "Year": 2018, + "Language": "ar", + "Dialect": "mixed", + "Domain": "other", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "4,476 sentences with over 61k words, extracted from 94 sources of Arabic written medicine information", + "Volume": "4,476", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "King Saud University Riyadh", + "Derived From": "nan", + "Paper Title": "ARC-WMI: Towards Building Arabic Readability Corpus for Written Medicine Information", + "Paper Link": "http://lrec-conf.org/workshops/lrec2018/W30/pdf/9_W30.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "readability assessment", + "Venue Title": "LREC", + "Citations": "0.0", + "Venue Type": "conference", + "Venue Name": "International Conference on Language Resources and Evaluation", + "Authors": "Abeer Aldayel,H. Al-Khalifa,S. Alaqeel,N. Abanmy,Maha Al-Yahya,Mona T. Diab", + "Affiliations": ",,,,,", + "Abstract": "Developing easy-to-read written medicine information continues to be a challenge in health communication. Readability aims to gauge the difficulty level of a text. Various formulas and machine learning algorithms have proposed to judge the readability of health materials and assist writers in identifying possible problems related to text difficulty. For this reason, having corpus annotated with readability levels is fundamental to evaluating the readability formulas and training machine learning algorithms. Arabic suffers from a lack of annotated corpora to evaluate text readability, especially for health materials. To address this shortage, we describe a baseline results towards constructing readability corpus ARC-WMI, a new Arabic collection of written medicine information annotated with readability levels. We compiled a corpus of 4476 sentences with over 61k words, extracted from 94 sources of Arabic written medicine information. 
These sentences were manually annotated and assigned a readability level (\u201cEasy,\u201d \u201cIntermediate,\u201d or \u201cDifficult\u201d) by a panel of five health-care professionals.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/arcd.json b/datasets/arcd.json new file mode 100644 index 0000000..e9dbd07 --- /dev/null +++ b/datasets/arcd.json @@ -0,0 +1,36 @@ +{ + "Name": "ARCD", + "Subsets": [], + "HF Link": "https://hf.co/datasets/hsseinmz/arcd", + "Link": "https://github.com/husseinmozannar/SOQAL", + "License": "MIT License", + "Year": 2019, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "wikipedia", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "1,395 questions posed by crowdworkers on Wikipedia articles", + "Volume": "1,395", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "AUB", + "Derived From": "Includes translation of SQuAD version 1.1", + "Paper Title": "Neural Arabic Question Answering", + "Paper Link": "https://arxiv.org/pdf/1906.05394.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "question answering", + "Venue Title": "WANLP", + "Citations": "29.0", + "Venue Type": "workshop", + "Venue Name": "Arabic Natural Language Processing Workshop", + "Authors": "Hussein Mozannar,Karl El Hajal,Elie Maamary,Hazem M. Hajj", + "Affiliations": ",,,", + "Abstract": "This paper tackles the problem of open domain factual Arabic question answering (QA) using Wikipedia as our knowledge source. This constrains the answer of any question to be a span of text in Wikipedia. Open domain QA for Arabic entails three challenges: annotated QA datasets in Arabic, large scale efficient information retrieval and machine reading comprehension. To deal with the lack of Arabic QA datasets we present the Arabic Reading Comprehension Dataset (ARCD) composed of 1,395 questions posed by crowdworkers on Wikipedia articles, and a machine translation of the Stanford Question Answering Dataset (Arabic-SQuAD). Our system for open domain question answering in Arabic (SOQAL) is based on two components: (1) a document retriever using a hierarchical TF-IDF approach and (2) a neural reading comprehension model using the pre-trained bi-directional transformer BERT. 
Our experiments on ARCD indicate the effectiveness of our approach with our BERT-based reader achieving a 61.3 F1 score, and our open domain system SOQAL achieving a 27.6 F1 score.", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/arcorona__analyzing_arabic_tweets_in_the_early_days_of_coronavirus_(covid-19)_pandemic.json b/datasets/arcorona__analyzing_arabic_tweets_in_the_early_days_of_coronavirus_(covid-19)_pandemic.json new file mode 100644 index 0000000..1e396bc --- /dev/null +++ b/datasets/arcorona__analyzing_arabic_tweets_in_the_early_days_of_coronavirus_(covid-19)_pandemic.json @@ -0,0 +1,36 @@ +{ + "Name": "ArCorona: Analyzing Arabic Tweets in the Early Days of Coronavirus (COVID-19) Pandemic", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/ArCorona", + "Link": "https://alt.qcri.org/resources/ArCorona.tsv", + "License": "unknown", + "Year": 2020, + "Language": "ar", + "Dialect": "mixed", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "Collected to prevent spreading of rumors and misinformation about the virus or bad cures", + "Volume": "8,000", + "Unit": "sentences", + "Ethical Risks": "High", + "Provider": "QCRI", + "Derived From": "nan", + "Paper Title": "ArCorona: Analyzing Arabic Tweets in the Early Days of Coronavirus (COVID-19) Pandemic", + "Paper Link": "https://arxiv.org/abs/2012.01462", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "COVID misinformation detection", + "Venue Title": "arXiv", + "Citations": "7.0", + "Venue Type": "preprint", + "Venue Name": "nan", + "Authors": "Hamdy Mubarak, Sabit Hassan", + "Affiliations": "Qatar Computing Research Institute", + "Abstract": "Over the past few months, there were huge numbers of circulating tweets and discussions about Coronavirus (COVID-19) in the Arab region. It is important for policy makers and many people to identify types of shared tweets to better understand public behavior, topics of interest, requests from governments, sources of tweets, etc. It is also crucial to prevent spreading of rumors and misinformation about the virus or bad cures. To this end, we present the largest manually annotated dataset of Arabic tweets related to COVID-19. 
We describe annotation guidelines, analyze our dataset and build effective machine learning and transformer based models for classification.", + "Added By": "Abdelrahman Kaseb" +} \ No newline at end of file diff --git a/datasets/arcov-19.json b/datasets/arcov-19.json new file mode 100644 index 0000000..b5c60ef --- /dev/null +++ b/datasets/arcov-19.json @@ -0,0 +1,36 @@ +{ + "Name": "ArCOV-19", + "Subsets": [], + "HF Link": "https://hf.co/datasets/bigIR/ar_cov19", + "Link": "https://gitlab.com/bigirqu/ArCOV-19", + "License": "unknown", + "Year": 2021, + "Language": "ar", + "Dialect": "mixed", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "Arabic COVID-19 Twitter dataset that covers the period from 27th of January 2020 till 31st of January 2021.", + "Volume": "3,140,158", + "Unit": "sentences", + "Ethical Risks": "Medium", + "Provider": "Qatar University", + "Derived From": "nan", + "Paper Title": "ArCOV-19: The First Arabic COVID-19 Twitter Dataset with Propagation Networks", + "Paper Link": "https://camel.abudhabi.nyu.edu/WANLP-2021-Program/47_Paper.pdf", + "Script": "Arab", + "Tokenized": "Yes", + "Host": "GitLab", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "information retrieval, social computing", + "Venue Title": "WANLP", + "Citations": "18.0", + "Venue Type": "workshop", + "Venue Name": "Arabic Natural Language Processing Workshop", + "Authors": "Fatima Haouari,Maram Hasanain,Reem Suwaileh,T. Elsayed", + "Affiliations": ",,,", + "Abstract": "In this paper, we present ArCOV-19, an Arabic COVID-19 Twitter dataset that spans one year, covering the period from 27th of January 2020 till 31st of January 2021. ArCOV-19 is the first publicly-available Arabic Twitter dataset covering COVID-19 pandemic that includes about 2.7M tweets alongside the propagation networks of the most-popular subset of them (i.e., most-retweeted and -liked). The propagation networks include both retweets and conversational threads (i.e., threads of replies). ArCOV-19 is designed to enable research under several domains including natural language processing, information retrieval, and social computing. 
Preliminary analysis shows that ArCOV-19 captures rising discussions associated with the first reported cases of the disease as they appeared in the Arab world. In addition to the source tweets and the propagation networks, we also release the search queries and the language-independent crawler used to collect the tweets to encourage the curation of similar datasets.", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/arcov19-rumors.json b/datasets/arcov19-rumors.json new file mode 100644 index 0000000..dee3ea3 --- /dev/null +++ b/datasets/arcov19-rumors.json @@ -0,0 +1,36 @@ +{ + "Name": "ArCOV19-Rumors", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/ArCOV19_claims", + "Link": "https://gitlab.com/bigirqu/ArCOV-19/-/tree/master/ArCOV19-Rumors", + "License": "unknown", + "Year": 2021, + "Language": "ar", + "Dialect": "mixed", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "The first Arabic dataset for rumors verification in Twitter", + "Volume": "9,414", + "Unit": "sentences", + "Ethical Risks": "Medium", + "Provider": "Qatar University", + "Derived From": "nan", + "Paper Title": "ArCOV19-Rumors: Arabic COVID-19 Twitter Dataset for Misinformation Detection", + "Paper Link": "https://aclanthology.org/2021.wanlp-1.8.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitLab", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "fact checking", + "Venue Title": "WANLP", + "Citations": "nan", + "Venue Type": "workshop", + "Venue Name": "Arabic Natural Language Processing Workshop", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Fatima Haouari" +} \ No newline at end of file diff --git a/datasets/arcovidvac.json b/datasets/arcovidvac.json new file mode 100644 index 0000000..5c4ce6f --- /dev/null +++ b/datasets/arcovidvac.json @@ -0,0 +1,36 @@ +{ + "Name": "ArCovidVac", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/ArCovidVac", + "Link": "https://alt.qcri.org/resources/ArCovidVac.zip", + "License": "unknown", + "Year": 2022, + "Language": "ar", + "Dialect": "mixed", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "the largest manually annotated Arabic tweet dataset, ArCovidVac, for the COVID-19 vaccination campaign, covering many countries in the Arab region", + "Volume": "10,000", + "Unit": "sentences", + "Ethical Risks": "High", + "Provider": "QCRI", + "Derived From": "nan", + "Paper Title": "ArCovidVac: Analyzing Arabic Tweets About COVID-19 Vaccination", + "Paper Link": "https://arxiv.org/pdf/2201.06496.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "QCRI Resources", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "informativeness, text classification, stance detection", + "Venue Title": "LREC", + "Citations": "nan", + "Venue Type": "conference", + "Venue Name": "Language Resources and Evaluation Conference", + "Authors": "Hamdy Mubarak, Sabit Hassan, Shammur Absar Chowdhury, Firoj Alam", + "Affiliations": "Qatar Computing Research Institute, HBKU; University of Pittsburgh;Qatar Computing Research Institute, HBKU", + "Abstract": "The emergence of the COVID-19 pandemic and the first global infodemic have changed our lives in many different ways. 
We relied on social media to get the latest information about COVID-19 pandemic and at the same time to disseminate information. The content in social media consisted not only health related advise, plans, and informative news from policymakers, but also contains conspiracies and rumors. It became important to identify such information as soon as they are posted to make an actionable decision (e.g., debunking rumors, or taking certain measures for traveling). To address this challenge, we develop and publicly release the first largest manually annotated Arabic tweet dataset, ArCovidVac, for the COVID-19 vaccination campaign, covering many countries in the Arab region. The dataset is enriched with different layers of annotation, including, (i) Informativeness (more vs. less importance of the tweets); (ii) fine-grained tweet content types (e.g., advice, rumors, restriction, authenticate news/information); and (iii) stance towards vaccination (pro-vaccination, neutral, anti-vaccination). Further, we performed in-depth analysis of the data, exploring the popularity of different vaccines, trending hashtags, topics and presence of offensiveness in the tweets. We studied the data for individual types of tweets and temporal changes in stance towards vaccine. We benchmarked the ArCovidVac dataset using transformer architectures for informativeness, content types, and stance detection.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/areej.json b/datasets/areej.json new file mode 100644 index 0000000..b0031be --- /dev/null +++ b/datasets/areej.json @@ -0,0 +1,36 @@ +{ + "Name": "AREEj", + "Subsets": [], + "HF Link": "https://hf.co/datasets/dru-ac/ArSRED", + "Link": "https://hf.co/datasets/dru-ac/ArSRED", + "License": "CC BY-SA 4.0", + "Year": 2024, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "This dataset was made by adding evidence annotations to the Arabic subset of SREDFM. It was introduced in the paper AREEj: Arabic Relation Extraction with Evidence at the Second Arabic Natural Language Processing Conference.", + "Volume": "500,000", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "nan", + "Derived From": "SREDFM", + "Paper Title": "AREEj: Arabic Relation Extraction with Evidence", + "Paper Link": "https://aclanthology.org/2024.arabicnlp-1.6.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "HuggingFace", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "relation extraction", + "Venue Title": "ArabicNLP", + "Citations": "nan", + "Venue Type": "conference", + "Venue Name": "Arabic Natural Language Processing Conference", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "Relational entity extraction is key in building knowledge graphs. A relational entity has a source, a tail and a type. In this paper, we consider Arabic text and introduce evidence enrichment which intuitively informs models for better predictions. Relational evidence is an expression in the text that explains how sources and targets relate. This paper augments the existing SREDFM relational extraction dataset with evidence annotation to its 2.9-million Arabic relations. We leverage the augmented dataset to build AREEj, a relation extraction with evidence model from Arabic documents. 
The evidence augmentation model we constructed to complete the dataset achieved .82 F1-score (.93 precision, .73 recall). The target AREEj outperformed SOTA mREBEL with .72 F1-score (.78 precision, .66 recall).", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/arentail.json b/datasets/arentail.json new file mode 100644 index 0000000..0b35aeb --- /dev/null +++ b/datasets/arentail.json @@ -0,0 +1,36 @@ +{ + "Name": "ArEntail", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/ArEntail", + "Link": "https://github.com/RashaMObeidat/ArEntail", + "License": "unknown", + "Year": 2024, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "Arabic NLI dataset called ArEntail, consisting of 6000 sentence pairs collected from news headlines and manually labeled to indicate whether an entailment relationship links the sentences or not without resorting to machine translation from English datasets", + "Volume": "6,000", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "Jordan University of Science and Technology", + "Derived From": "nan", + "Paper Title": "ArEntail: manually-curated Arabic natural language inference dataset from news headlines", + "Paper Link": "https://link.springer.com/article/10.1007/s10579-024-09731-1", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "natural language inference", + "Venue Title": "LRE", + "Citations": "nan", + "Venue Type": "journal", + "Venue Name": "Language Resources and Evaluation", + "Authors": "Rasha Obeidat, Yara Al-Harahsheh, Mahmoud Al-Ayyoub, Maram Gharaibeh", + "Affiliations": "nan", + "Abstract": "Natural language inference (NLI), also known as textual entailment recognition (TER), is a crucial task in natural language processing that combines many fundamental aspects of language understanding. Despite the recent significant advancement in NLI, primarily driven by the development of diverse large-scale datasets, most of the progress has been confined to English. This is attributed to the scarcity of human-annotated corpora for most other languages, notably Arabic. In this paper, we present an Arabic NLI dataset called ArEntail, consisting of 6000 sentence pairs collected from news headlines and manually labeled to indicate whether an entailment relationship links the sentences or not without resorting to machine translation from English datasets. To our knowledge, this is the largest yet human-crafted NLI dataset for the Arabic language. We offer various data analyses and establish baseline results using state-of-the-art pre-trained models for Arabic, in addition to a human-based evaluation. Our findings revealed that AraBERT-base v2, the best-performing model, achieves an accuracy of 93%, revealing a gap of 2.6% compared to human performance and presenting a valuable opportunity for further advancements in modeling techniques in future research. Besides, the \u201chypothesis-only\u201d baseline performance closely resembles a random guesser\u2019s, indicating the rarity of annotation artifacts compared to prior NLI English benchmarks. 
We also evaluated GPT-3.5-turbo in zero-shot and few-shot Arabic NLI learning scenarios and observed promising outcomes with a cautious approach, awaiting strong clues for predicting the presence of the entailment relationship.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/arl_arabic_dependency_treebank.json b/datasets/arl_arabic_dependency_treebank.json new file mode 100644 index 0000000..d833f51 --- /dev/null +++ b/datasets/arl_arabic_dependency_treebank.json @@ -0,0 +1,36 @@ +{ + "Name": "ARL Arabic Dependency Treebank", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2016T18", + "License": "LDC User Agreement for Non-Members", + "Year": 2016, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "other", + "Description": "The source data in this release consists of Arabic newswire and broadcast programming collected by LDC from various news and broadcast providers.", + "Volume": "nan", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "Arabic Treebank", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "Yes", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "2,000.00 $", + "Test Split": "No", + "Tasks": "part of speech tagging", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/armath.json b/datasets/armath.json new file mode 100644 index 0000000..e9b76e6 --- /dev/null +++ b/datasets/armath.json @@ -0,0 +1,36 @@ +{ + "Name": "ArMATH", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/ArMATH", + "Link": "https://github.com/reem-codes/ArMATH", + "License": "unknown", + "Year": 2022, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "There are 6000 samples and 883 templates. 
A template is an equation in which the variables have been replaced with ordered placeholders.", + "Volume": "6,000", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "ArMATH: a Dataset for Solving Arabic Math Word Problems", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "math solving", + "Venue Title": "LREC", + "Citations": "nan", + "Venue Type": "conference", + "Venue Name": "Language Resources and Evaluation Conference", + "Authors": "Reem Ali Alghamdi, Zhenwen Liang and Xiangliang Zhang", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/armi__arabic_misogynistic_dataset.json b/datasets/armi__arabic_misogynistic_dataset.json new file mode 100644 index 0000000..d40bec5 --- /dev/null +++ b/datasets/armi__arabic_misogynistic_dataset.json @@ -0,0 +1,36 @@ +{ + "Name": "ArMI: Arabic Misogynistic Dataset", + "Subsets": [], + "HF Link": "nan", + "Link": "https://github.com/bilalghanem/armi", + "License": "unknown", + "Year": 2022, + "Language": "ar", + "Dialect": "mixed", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "Arabic multidialectal dataset for misogynistic language", + "Volume": "9,833", + "Unit": "sentences", + "Ethical Risks": "Medium", + "Provider": "The ORSAM Center for Middle Eastern Studies, University of Alberta", + "Derived From": "nan", + "Paper Title": "ArMI at FIRE 2021: Overview of the First Shared Task on Arabic Misogyny Identification", + "Paper Link": "http://ceur-ws.org/Vol-3159/T5-1.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Upon-Request", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "misogyny identification", + "Venue Title": "FIRE", + "Citations": "6.0", + "Venue Type": "conference", + "Venue Name": "Forum for Information Retrieval Evaluation", + "Authors": "Hala Mulki, Bilal Ghanem", + "Affiliations": "ORSAM Center for Middle Eastern Studies, Turkey; University of Alberta, Canada", + "Abstract": "This paper provides an overview of the organization, results and main findings of the first shared task on misogyny identification in Arabic tweets. Arabic Misogyny Identification task (ArMI) is introduced within the Hate Speech and Offensive Content detection (HASOC) track at FIRE-2021. The ArMI task combines two related classification subtasks: a main binary classification subtask for detecting the presence of misogynistic language, and a fine-grained multi-class classification subtask for identifying seven misogynistic behaviors found in misogynistic contents. The data provided for this task is a Twitter dataset composed of 9,833 tweets written in modern standard Arabic (MSA) and several Arabic dialects including Levantine, Egyptian and Gulf. ArMI at FIRE-2021 has got a total of 15 submitted runs for Sub-task A and 13 runs for Sub-task B provided by six different teams. The systems introduced by the participants employed various methods including feature-based, neural networks using either classical machine learning techniques, ensemble methods or transformers. The best performing system achieved an F-measure of 91.4% and 66.5% for subtask A and subtask B, respectively. 
This indicates that misogynistic language detection and misogynistic behaviors identification in Arabic textual contents can be, effectively, addressed using transformer-based approaches.", + "Added By": "Hala Mulki" +} \ No newline at end of file diff --git a/datasets/arparallel.json b/datasets/arparallel.json new file mode 100644 index 0000000..116a452 --- /dev/null +++ b/datasets/arparallel.json @@ -0,0 +1,36 @@ +{ + "Name": "arparallel", + "Subsets": [], + "HF Link": "nan", + "Link": "http://www.cs.cmu.edu/~fraisi/arabic/arparallel/", + "License": "CC BY 3.0", + "Year": 2018, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "crawling and annotation(translation)", + "Description": "The first monolingual parallel corpus of Arabic generated automatically from translating a bilingual English-French corpus. It can be used to train sequence-to-sequence models for paraphrasing, machine translation, text simplification, and other language generation tasks.", + "Volume": "100,000", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "Carnegie Mellon University", + "Derived From": "Europarl-v7 English-French parallel corpus", + "Paper Title": "A Monolingual Parallel Corpus of Arabic", + "Paper Link": "https://www.sciencedirect.com/science/article/pii/S1877050918321914", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", 
+ "Tasks": "machine translation, summarization, text simplification, language modeling, information retrieval ", + "Venue Title": "ACLing", + "Citations": "nan", + "Venue Type": "conference", + "Venue Name": " 4th International Conference on Arabic Computational Linguistics ", + "Authors": "Fatima Al-Raisi, Weijian Lin, Abdelwahab Bourai", + "Affiliations": "Carnegie Mellon University, Sultan Qaboos University, Facebook, Uber Advanced Technologies Group", + "Abstract": " The paper introduces the first monolingual parallel corpus of Arabic, generated automatically from a parallel bilingual corpus. It aims to address the lack of Arabic monolingual data for tasks such as paraphrasing and text generation. The corpus includes different versions of varying sizes and can be used to train sequence-to-sequence models. The quality of the generated corpus has been evaluated and is made publicly available.", + "Added By": "Maryam Al Emadi" +} \ No newline at end of file diff --git a/datasets/arpc__a_corpus_for_paraphrase_identification_in_arabic_text.json b/datasets/arpc__a_corpus_for_paraphrase_identification_in_arabic_text.json new file mode 100644 index 0000000..9ce2105 --- /dev/null +++ b/datasets/arpc__a_corpus_for_paraphrase_identification_in_arabic_text.json @@ -0,0 +1,36 @@ +{ + "Name": "ARPC: A Corpus for Paraphrase Identification in Arabic Text", + "Subsets": [], + "HF Link": "nan", + "Link": "https://ieee-dataport.org/documents/arpc-corpus-paraphrase-identification-arabic-text#files", + "License": "unknown", + "Year": 2019, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "ArPC is an Arabic paraphrase identification corpus. It consists of 1331 sentence pairs along with their binary score that indicates weather the pairs are paraphrase or not. 
The corpus has been manually annotated by three Arabic native speakers.", + "Volume": "1,331", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "With-Fee", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "paraphrase identification", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/arpod.json b/datasets/arpod.json new file mode 100644 index 0000000..5f141d6 --- /dev/null +++ b/datasets/arpod.json @@ -0,0 +1,73 @@ +{ + "Name": "ArPod", + "Subsets": [ + { + "Name": "KSA", + "Dialect": "ar-SA: (Arabic (Saudi Arabia))", + "Volume": "2.33", + "Unit": "hours" + }, + { + "Name": "MSA", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Volume": "0.83", + "Unit": "hours" + }, + { + "Name": "SYR", + "Dialect": "ar-SY: (Arabic (Syria))", + "Volume": "0.83", + "Unit": "hours" + }, + { + "Name": "EGY", + "Dialect": "ar-EG: (Arabic (Egypt))", + "Volume": "1.5", + "Unit": "hours" + }, + { + "Name": "LEB", + "Dialect": "ar-LB: (Arabic (Lebanon))", + "Volume": "1.5", + "Unit": "hours" + }, + { + "Name": "ENG", + "Dialect": "mixed", + "Volume": "0.83", + "Unit": "hours" + } + ], + "HF Link": "https://hf.co/datasets/arbml/ArPod", + "Link": "https://www.kaggle.com/datasets/corpora4research/arpod-corpus-based-on-arabic-podcasts", + "License": "unknown", + "Year": 2019, + "Language": "multilingual", + "Dialect": "mixed", + "Domain": "other", + "Form": "spoken", + "Collection Style": "other", + "Description": "The corpus has a duration of 8.1 hours sampled at 16 kHz and coded on 16 bits. The considered languages are: MSA, English; whereas the dialects are: Syrian, Saudi Arabic, Egyptian, and Lebanese. 70% of this dataset was used for training, and the remaining 30% for the test phase.", + "Volume": "7.82", + "Unit": "hours", + "Ethical Risks": "Low", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "Building a Speech Corpus based on Arabic Podcasts for Language and Dialect Identification", + "Paper Link": "https://aclanthology.org/W19-7408.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "kaggle", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "dialect identification from speech", + "Venue Title": "ICNLSP", + "Citations": "nan", + "Venue Type": "conference", + "Venue Name": "International Conference on Natural Language and Speech Processing", + "Authors": "Khaled Lounnas, Mourad Abbas, Mohamed Lichouri", + "Affiliations": "USTHB University, Algeria; CRSTDLA;CRSTDLA", + "Abstract": "In this paper, we present ArPod, a new Arabic speech corpus made of Arabic audio podcasts. We built this dataset, mainly for both speech-based multi-lingual and multi-dialectal identification tasks. It includes two languages: Modern Standard Arabic (MSA) and English, and four Arabic dialects: Saudi, Egyptian, Lebanese and Syrian. A set of supervised classifiers have been used: Support Vector Machines (SVM), Multi Layer Perceptron (MLP), K-Nearest Neighbors (KNN), Extra-trees and Convolutional Neural Networks (CNN), using acoustic and spectral features. For both tasks, SVM yielded encouraging results and outperformed the other classifiers. 
Keywords: Language Identification, Dialect Identification, CNN, Acoustic features, spectral features, SVM, Arabic Podcast.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/arqat-aqi__answerable_question_identification_in_arabic_tweets.json b/datasets/arqat-aqi__answerable_question_identification_in_arabic_tweets.json new file mode 100644 index 0000000..bb9820e --- /dev/null +++ b/datasets/arqat-aqi__answerable_question_identification_in_arabic_tweets.json @@ -0,0 +1,36 @@ +{ + "Name": "ArQAT-AQI: Answerable Question Identification in Arabic Tweets", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/AQweets", + "Link": "https://www.dropbox.com/sh/coba3b1nqkyloa8/AAC4Sk5WQvtXZRgH5liBkMiGa?dl=0", + "License": "unknown", + "Year": 2017, + "Language": "ar", + "Dialect": "mixed", + "Domain": "social media", + "Form": "text", + "Collection Style": "other", + "Description": "Answerable Question Identification in Arabic Tweets", + "Volume": "13,252", + "Unit": "sentences", + "Ethical Risks": "Medium", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "Dropbox", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "answerable questions", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/arquad.json b/datasets/arquad.json new file mode 100644 index 0000000..d0c7869 --- /dev/null +++ b/datasets/arquad.json @@ -0,0 +1,36 @@ +{ + "Name": "ArQuAD", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/ArQuAD", + "Link": "https://github.com/RashaMObeidat/ArQuAD", + "License": "unknown", + "Year": 2024, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "wikipedia", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "a large MRC dataset for the Arabic language. The dataset comprises 16,020 questions posed by language experts on passages extracted from Arabic Wikipedia articles, where the answer to each question is a text segment from the corresponding reading passage.", + "Volume": "16,020", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "Jordan University of Science and Technology", + "Derived From": "nan", + "Paper Title": "ArQuAD: An Expert-Annotated Arabic Machine Reading Comprehension Dataset", + "Paper Link": "https://link.springer.com/article/10.1007/s12559-024-10248-6", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "question answering", + "Venue Title": "Cognitive Computation", + "Citations": "nan", + "Venue Type": "journal", + "Venue Name": "Cognitive Computation", + "Authors": "Rasha Obeidat, Marwa Al-Harbi, Mahmoud Al-Ayyoub, Luay Alawneh", + "Affiliations": "nan", + "Abstract": "Machine Reading Comprehension (MRC) is a task that enables machines to mirror key cognitive processes involving reading, comprehending a text passage, and answering questions about it. There has been significant progress in this task for English in recent years, where recent systems not only surpassed human-level performance but also demonstrated advancements in emulating complex human cognitive processes. 
However, the development of Arabic MRC has not kept pace due to language challenges and the lack of large-scale, high-quality datasets. Existing datasets are either small, low quality or released as a part of large multilingual corpora. We present the Arabic Question Answering Dataset (ArQuAD), a large MRC dataset for the Arabic language. The dataset comprises 16,020 questions posed by language experts on passages extracted from Arabic Wikipedia articles, where the answer to each question is a text segment from the corresponding reading passage. Besides providing various dataset analyses, we fine-tuned several pre-trained language models to obtain benchmark results. Among the compared methods, AraBERTv0.2-large achieved the best performance with an exact match of 68.95% and an F1-score of 87.15%. However, the significantly higher performance observed in human evaluations (exact match of 86% and F1-score of 95.5%) suggests a significant margin of possible improvement in future research. We release the dataset publicly at https://github.com/RashaMObeidat/ArQuAD to encourage further development of language-aware MRC models for the Arabic language.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/arsarcasm-v2.json b/datasets/arsarcasm-v2.json new file mode 100644 index 0000000..46cfd47 --- /dev/null +++ b/datasets/arsarcasm-v2.json @@ -0,0 +1,67 @@ +{ + "Name": "ArSarcasm-v2", + "Subsets": [ + { + "Name": "Egyptian", + "Dialect": "ar-EG: (Arabic (Egypt))", + "Volume": "2,981", + "Unit": "sentences" + }, + { + "Name": "Gulf", + "Dialect": "ar-GLF: (Arabic (Gulf))", + "Volume": "966", + "Unit": "sentences" + }, + { + "Name": "Levantine", + "Dialect": "ar-LEV: (Arabic (Levant))", + "Volume": "671", + "Unit": "sentences" + }, + { + "Name": "Maghrebi", + "Dialect": "ar-MA: (Arabic (Morocco))", + "Volume": "45", + "Unit": "sentences" + }, + { + "Name": "MSA", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Volume": "10,885", + "Unit": "sentences" + } + ], + "HF Link": "https://hf.co/datasets/arbml/ArSarcasm_v2", + "Link": "https://github.com/iabufarha/ArSarcasm-v2", + "License": "MIT License", + "Year": 2021, + "Language": "ar", + "Dialect": "mixed", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "ArSarcasm-v2 is an extension of the original ArSarcasm dataset published along with the paper From Arabic Sentiment Analysis to Sarcasm Detection: The ArSarcasm Dataset. ArSarcasm-v2 consists of ArSarcasm along with portions of DAICT corpus and some new tweets. Each tweet was annotated for sarcasm, sentiment and dialect. The final dataset consists of 15,548 tweets divided into 12,548 training tweets and 3,000 testing tweets.
ArSarcasm-v2 was used and released as a part of the shared task on sarcasm detection and sentiment analysis in Arabic.", + "Volume": "15,548", + "Unit": "sentences", + "Ethical Risks": "Medium", + "Provider": "Multiple Institutions", + "Derived From": "ArSarcasm: https://github.com/iabufarha/ArSarcasm", + "Paper Title": "Overview of the WANLP 2021 Shared Task on Sarcasm and Sentiment Detection in Arabic", + "Paper Link": "https://aclanthology.org/2021.wanlp-1.36/", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "sarcasm detection, sentiment analysis, dialect identification", + "Venue Title": "WANLP", + "Citations": "20.0", + "Venue Type": "workshop", + "Venue Name": "Arabic Natural Language Processing Workshop", + "Authors": "Ibrahim Abu Farha, Wajdi Zaghouani, Walid Magdy", + "Affiliations": "The University of Edinburgh, Hamad Bin Khalifa University", + "Abstract": "This paper provides an overview of the WANLP 2021 shared task on sarcasm and sentiment detection in Arabic. The shared task has two subtasks: sarcasm detection (subtask 1) and sentiment analysis (subtask 2). This shared task aims to promote and bring attention to Arabic sarcasm detection, which is crucial to improve the performance in other tasks such as sentiment analysis. The dataset used in this shared task, namely ArSarcasm-v2, consists of 15,548 tweets labelled for sarcasm, sentiment and dialect. We received 27 and 22 submissions for subtasks 1 and 2 respectively. Most of the approaches relied on using and fine-tuning pre-trained language models such as AraBERT and MARBERT. The top achieved results for the sarcasm detection and sentiment analysis tasks were 0.6225 F1-score and 0.748 F1-PN respectively.", + "Added By": "Ibrahim Abu Farha" +} \ No newline at end of file diff --git a/datasets/arsarcasm.json b/datasets/arsarcasm.json new file mode 100644 index 0000000..0186e58 --- /dev/null +++ b/datasets/arsarcasm.json @@ -0,0 +1,67 @@ +{ + "Name": "ArSarcasm", + "Subsets": [ + { + "Name": "Egyptian", + "Dialect": "ar-EG: (Arabic (Egypt))", + "Volume": "2,383", + "Unit": "sentences" + }, + { + "Name": "Gulf", + "Dialect": "ar-GLF: (Arabic (Gulf))", + "Volume": "519", + "Unit": "sentences" + }, + { + "Name": "Levantine", + "Dialect": "ar-LEV: (Arabic (Levant))", + "Volume": "551", + "Unit": "sentences" + }, + { + "Name": "Maghrebi", + "Dialect": "ar-MA: (Arabic (Morocco))", + "Volume": "32", + "Unit": "sentences" + }, + { + "Name": "MSA", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Volume": "7,062", + "Unit": "sentences" + } + ], + "HF Link": "https://hf.co/datasets/iabufarha/ar_sarcasm", + "Link": "https://github.com/iabufarha/ArSarcasm", + "License": "unknown", + "Year": 2020, + "Language": "ar", + "Dialect": "mixed", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "The dataset was created using previously available Arabic sentiment analysis datasets", + "Volume": "8,437", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "Multiple Institutions", + "Derived From": "ASTD", + "Paper Title": "From Arabic Sentiment Analysis to Sarcasm Detection: The ArSarcasm Dataset", + "Paper Link": "https://aclanthology.org/2020.osact-1.5.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "dialect identification, sentiment analysis, sarcasm detection", + "Venue Title": "OSACT", + "Citations": "23.0", + "Venue Type": "workshop", + "Venue Name": "Workshop on Open-Source Arabic Corpora and Processing Tools", + "Authors": "Ibrahim Abu Farha, Walid Magdy", + "Affiliations": "University of Edinburgh, The University of Edinburgh", + "Abstract": "Sarcasm is one of the main challenges for sentiment analysis systems. Its complexity comes from the expression of opinion using implicit indirect phrasing. In this paper, we present ArSarcasm, an Arabic sarcasm detection dataset, which was created through the reannotation of available Arabic sentiment analysis datasets. The dataset contains 10,547 tweets, 16% of which are sarcastic. In addition to sarcasm the data was annotated for sentiment and dialects. Our analysis shows the highly subjective nature of these tasks, which is demonstrated by the shift in sentiment labels based on annotators\u2019 biases. Experiments show the degradation of state-of-the-art sentiment analysers when faced with sarcastic content. Finally, we train a deep learning model for sarcasm detection using BiLSTM. The model achieves an F1 score of 0.46, which shows the challenging nature of the task, and should act as a basic baseline for future research on our dataset.", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/arsarcasmoji.json b/datasets/arsarcasmoji.json new file mode 100644 index 0000000..58c7d47 --- /dev/null +++ b/datasets/arsarcasmoji.json @@ -0,0 +1,36 @@ +{ + "Name": "ArSarcasMoji", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/ArSarcasMoji", + "Link": "https://github.com/ShathaHakami/ArSarcasMoji-Dataset", + "License": "unknown", + "Year": 2023, + "Language": "ar", + "Dialect": "mixed", + "Domain": "social media", + "Form": "text", + "Collection Style": "other", + "Description": "A dataset of 24,630 emoji-augmented Arabic texts, of which 17.5% are ironic (either humorous or sarcastic).", + "Volume": "24,630", + "Unit": "sentences", + "Ethical Risks": "High", + "Provider": "Multiple Institutions", + "Derived From": "AraSenCorpus, ArCovid_19, ArSarcasm, Kawarith, ATSAD, ArSAS, TEAD, ASAD, etc.", + "Paper Title": "ArSarcasMoji Dataset: The Emoji Sentiment Roles in Arabic Ironic", + "Paper Link": "https://aclanthology.org/2023.arabicnlp-1.18.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "sentiment analysis, irony detection", + "Venue Title": "ArabicNLP", + "Citations": "nan", + "Venue Type": "conference", + "Venue Name": "Arabic Natural Language Processing Conference", + "Authors": "Shatha Ali A. Hakami, Robert Hendley, Phillip Smith", + "Affiliations": "nan", + "Abstract": "In digital communication, emoji are essential in decoding nuances such as irony, sarcasm, and humour. However, their incorporation in Arabic natural language processing (NLP) has been cautious because of the perceived complexities of the Arabic language. This paper introduces ArSarcasMoji, a dataset of 24,630 emoji-augmented texts, with 17.5% that shows irony. Through our analysis, we highlight specific emoji patterns paired with sentiment roles that denote irony in Arabic texts.
The research counters prevailing notions, emphasising the importance of emoji\u2019s role in understanding Arabic textual irony, and addresses their potential for accurate irony detection in Arabic digital content.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/arsas.json b/datasets/arsas.json new file mode 100644 index 0000000..fb6db00 --- /dev/null +++ b/datasets/arsas.json @@ -0,0 +1,36 @@ +{ + "Name": "ArSAS", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/ArSAS", + "Link": "https://homepages.inf.ed.ac.uk/wmagdy/resources.htm", + "License": "unknown", + "Year": 2018, + "Language": "ar", + "Dialect": "mixed", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "A set of 21K Arabic tweets labeled for 4 classes of sentiment and 6 classes of speech-act", + "Volume": "21,000", + "Unit": "sentences", + "Ethical Risks": "Medium", + "Provider": "Multiple Institutions", + "Derived From": "nan", + "Paper Title": "ArSAS: An Arabic Speech-Act and Sentiment Corpus of Tweets", + "Paper Link": "http://lrec-conf.org/workshops/lrec2018/W30/pdf/22_W30.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "sentiment analysis, speech act classification", + "Venue Title": "LREC", + "Citations": "25.0", + "Venue Type": "conference", + "Venue Name": "International Conference on Language Resources and Evaluation", + "Authors": "AbdelRahim Elmadany, Hamdy Mubarak, Walid Magdy", + "Affiliations": "University of British Columbia,,The University of Edinburgh", + "Abstract": "Speech acts are the type of communicative acts within a conversation. Speech act recognition (aka classification) has been an active research in recent years. However, much less attention was directed towards this task in Arabic due to the lack of resources for training an Arabic speech-act classifier. In this paper we present ArSAS, an Arabic corpus of tweets annotated for the tasks of speech-act recognition and sentiment analysis. A large set of 21k Arabic tweets covering multiple topics were collected, prepared and annotated for six different classes of speech-act labels, such as expression, assertion, and question. In addition, the same set of tweets were also annotated with four classes of sentiment. We aim to have this corpus promoting the research in both speech-act recognition and sentiment analysis tasks for Arabic language.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/arsen-20.json b/datasets/arsen-20.json new file mode 100644 index 0000000..90b4358 --- /dev/null +++ b/datasets/arsen-20.json @@ -0,0 +1,36 @@ +{ + "Name": "ArSen-20", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/Arsen_20", + "Link": "https://github.com/123fangyang/ArSen-20", + "License": "Apache-2.0", + "Year": 2024, + "Language": "multilingual", + "Dialect": "mixed", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "ArSen-20, a benchmark dataset tailored to propel Arabic sentiment detection forward.
ArSen-20 comprises 20,000 professionally labeled tweets sourced from Twitter, focusing on the theme of COVID-19 and spanning the period from 2020 to 2023.", + "Volume": "20,000", + "Unit": "sentences", + "Ethical Risks": "High", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "English-Arabic Phonetic Dataset construction", + "Paper Link": "https://www.bio-conferences.org/articles/bioconf/pdf/2024/16/bioconf_iscku2024_00057.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "sentiment analysis", + "Venue Title": "ISCKU", + "Citations": "nan", + "Venue Type": "conference", + "Venue Name": "International Scientific Conference of Alkafeel University", + "Authors": "Zaid Rajih Mohammed, Ahmed H. Aliwy", + "Affiliations": "nan", + "Abstract": "In the field of natural language processing, the effectiveness of a semantic similarity task is significantly influenced by the presence of an extensive corpus. While numerous monolingual corpora exist, predominantly in English, the availability of multilingual resources remains quite restricted. In this study, we present a semi-automated framework designed for generating a multilingual phonetic English-Arabic corpus, specifically tailored for application in multilingual phonetically and semantic similarity tasks. The proposed model consists of four phases: data gathering, preprocessing and translation, extraction IPA representation, and manual correction. Four datasets were used; one of them was constructed from many sources. A manual correction was used at all the levels of the system to produce a golden standard dataset. The final dataset was in the form (English Word, English Phonetic, equivalent Arabic Word, and Arabic Phonetic). Also, a deep learning approach was used for extracting International Phonetic Alphabet (IPA) phonetic representation where the results for 13400 samples show that the Phonetic Error Rate (PER) and accuracy were 11.96% and 88.04% respectively which are good results for producing IPA representation for unknown English and Arabic names.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/arsenl.json b/datasets/arsenl.json new file mode 100644 index 0000000..beb166b --- /dev/null +++ b/datasets/arsenl.json @@ -0,0 +1,36 @@ +{ + "Name": "ArSenL", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/ArSenL", + "Link": "http://oma-project.com/ArSenL/download_intro", + "License": "custom", + "Year": 2014, + "Language": "ar", + "Dialect": "mixed", + "Domain": "other", + "Form": "text", + "Collection Style": "crawling", + "Description": "A large-scale Standard Arabic sentiment lexicon (ArSenL) built using a combination of existing resources: ESWN, Arabic WordNet, and the Standard Arabic Morphological Analyzer (SAMA)", + "Volume": "28,760", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "AUB, NYUAD", + "Derived From": "nan", + "Paper Title": "A Large Scale Arabic Sentiment Lexicon for Arabic Opinion Mining", + "Paper Link": "https://aclanthology.org/W14-3623.pdf", + "Script": "Latn", + "Tokenized": "No", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "part of speech tagging, sentiment analysis", + "Venue Title": "WANLP", + "Citations": "130.0", + "Venue Type": "workshop", + "Venue Name": "Arabic Natural Language Processing Workshop", + "Authors": "Gilbert Badaro, R. Baly, Hazem M. Hajj, Nizar Habash, W. El-Hajj", + "Affiliations": ",,,,", + "Abstract": "Most opinion mining methods in English rely successfully on sentiment lexicons, such as English SentiWordnet (ESWN). While there have been efforts towards building Arabic sentiment lexicons, they suffer from many deficiencies: limited size, unclear usability plan given Arabic\u2019s rich morphology, or nonavailability publicly. In this paper, we address all of these issues and produce the first publicly available large scale Standard Arabic sentiment lexicon (ArSenL) using a combination of existing resources: ESWN, Arabic WordNet, and the Standard Arabic Morphological Analyzer (SAMA). We compare and combine two methods of constructing this lexicon with an eye on insights for Arabic dialects and other low resource languages. We also present an extrinsic evaluation in terms of subjectivity and sentiment analysis.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/arsentd-lev.json b/datasets/arsentd-lev.json new file mode 100644 index 0000000..a84fb59 --- /dev/null +++ b/datasets/arsentd-lev.json @@ -0,0 +1,36 @@ +{ + "Name": "ArSenTD-LEV", + "Subsets": [], + "HF Link": "https://hf.co/datasets/ramybaly/arsentd_lev", + "Link": "http://oma-project.com/ArSenL/ArSenTD_Lev_Intro", + "License": "custom", + "Year": 2019, + "Language": "ar", + "Dialect": "ar-LEV: (Arabic (Levant))", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(translation)", + "Description": "ArSentD-LEV is a multi-topic corpus for target-based sentiment analysis in Arabic Levantine tweets", + "Volume": "4,000", + "Unit": "sentences", + "Ethical Risks": "Medium", + "Provider": "American University of Beirut", + "Derived From": "nan", + "Paper Title": "ArSentD-LEV: A Multi-Topic Corpus for Target-based Sentiment Analysis in Arabic Levantine Tweets", + "Paper Link": "https://paperswithcode.com/paper/190601830", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "sentiment analysis", + "Venue Title": "LREC", + "Citations": "45.0", + "Venue Type": "conference", + "Venue Name": "International Conference on Language Resources and Evaluation", + "Authors": "Ramy Baly, Alaa Khaddaj, Hazem Hajj, Wassim El-Hajj, Khaled Bashir Shaban", + "Affiliations": "(1) MIT Computer Science and Artificial Intelligence Laboratory, Cambridge, MA 02139, USA (2) American University of Beirut, Electrical and Computer Engineering Department, Beirut, Lebanon (3) American University of Beirut, Computer Science Department, Beirut, Lebanon (4) Qatar University, Computer Science and Engineering Department, Doha, Qatar", + "Abstract": "Sentiment analysis is a highly subjective and challenging task. Its complexity further increases when applied to the Arabic language, mainly because of the large variety of dialects that are unstandardized and widely used in the Web, especially in social media. While many datasets have been released to train sentiment classifiers in Arabic, most of these datasets contain shallow annotation, only marking the sentiment of the text unit, as a word, a sentence or a document. In this paper, we present the Arabic Sentiment Twitter Dataset for the Levantine dialect (ArSenTD-LEV).
Based on findings from analyzing tweets from the Levant region, we created a dataset of 4,000 tweets with the following annotations: the overall sentiment of the tweet, the target to which the sentiment was expressed, how the sentiment was expressed, and the topic of the tweet. Results confirm the importance of these annotations at improving the performance of a baseline sentiment classifier. They also confirm the gap of training in a certain domain, and testing in another domain.", + "Added By": "Jezia Zakraoui" +} \ No newline at end of file diff --git a/datasets/arsentiment.json b/datasets/arsentiment.json new file mode 100644 index 0000000..1dea2dc --- /dev/null +++ b/datasets/arsentiment.json @@ -0,0 +1,36 @@ +{ + "Name": "ArSentiment", + "Subsets": [], + "HF Link": "https://hf.co/datasets/hadyelsahar/ar_res_reviews", + "Link": "https://github.com/hadyelsahar/large-arabic-sentiment-analysis-resouces", + "License": "unknown", + "Year": 2015, + "Language": "ar", + "Dialect": "mixed", + "Domain": "reviews", + "Form": "text", + "Collection Style": "crawling", + "Description": "Automatically annotated reviews in the domains of movies, hotels, restaurants and products", + "Volume": "42,692", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "Nile University", + "Derived From": "nan", + "Paper Title": "Building Large Arabic Multi-domain Resources for Sentiment Analysis", + "Paper Link": "https://link.springer.com/chapter/10.1007/978-3-319-18117-2_2", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "sentiment analysis, review classification", + "Venue Title": "CICLing", + "Citations": "127.0", + "Venue Type": "conference", + "Venue Name": "International Conference on Computational Linguistics and Intelligent Text Processing", + "Authors": "Hady ElSahar, S. El-Beltagy", + "Affiliations": ",", + "Abstract": "While there has been a recent progress in the area of Arabic Sentiment Analysis, most of the resources in this area are either of limited size, domain specific or not publicly available. In this paper, we address this problem by generating large multi-domain datasets for Sentiment Analysis in Arabic. The datasets were scrapped from different reviewing websites and consist of a total of 33K annotated reviews for movies, hotels, restaurants and products. Moreover we build multi-domain lexicons from the generated datasets. Different experiments have been carried out to validate the usefulness of the datasets and the generated lexicons for the task of sentiment classification. From the experimental results, we highlight some useful insights addressing: the best performing classifiers and feature representation methods, the effect of introducing lexicon based features and factors affecting the accuracy of sentiment classification in general.
All the datasets, experiments code and results have been made publicly available for scientific purposes.", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/arsl21l.json b/datasets/arsl21l.json new file mode 100644 index 0000000..1b78313 --- /dev/null +++ b/datasets/arsl21l.json @@ -0,0 +1,36 @@ +{ + "Name": "ArSL21L", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/ArSL21L", + "Link": "https://data.mendeley.com/datasets/8hrn3bvdvk/1", + "License": "CC BY 4.0", + "Year": 2022, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "images", + "Collection Style": "manual curation", + "Description": "Annotated Arabic Sign Language Letters Dataset (ArSL21L) consisting of 14,202 images of 32 letter signs with various backgrounds collected from 50 people.", + "Volume": "14,202", + "Unit": "images", + "Ethical Risks": "Low", + "Provider": "Al Ain University", + "Derived From": "nan", + "Paper Title": "ArSL21L: Arabic Sign Language Letter Dataset Benchmarking and an Educational Avatar for Metaverse Applications", + "Paper Link": "https://ieeexplore.ieee.org/abstract/document/9766497", + "Script": "Arab", + "Tokenized": "No", + "Host": "Mendeley Data", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "sign language detection", + "Venue Title": "EDUCON", + "Citations": "nan", + "Venue Type": "conference", + "Venue Name": "IEEE Global Engineering Education Conference", + "Authors": "Ganzorig Batnasan, Munkhjargal Gochoo, Munkh-Erdene Otgonbold, Fady Alnajjar, Timothy K Shih", + "Affiliations": "nan", + "Abstract": "It is complicated for the PwHL (people with hearing loss) to make a relationship with social majority, which naturally demands an interactive auto computer systems that have ability to understand sign language. With a trending Metaverse applications using augmented reality (AR) and virtual reality (VR), it is easier and interesting to teach sign language remotely using an avatar that mimics the gesture of a person using AI (Artificial Intelligence)-based system. There are various proposed methods and datasets for English SL (sign language); however, it is limited for Arabic sign language. Therefore, we present our collected and annotated Arabic Sign Language Letters Dataset (ArSL21L) consisting of 14202 images of 32 letter signs with various backgrounds collected from 50 people. We benchmarked our ArSL21L dataset on state-of-the-art object detection models, i.e., 4 versions of YOLOv5. Among the models, YOLOv5l achieved the best result with COCOmAP of 0.83. Moreover, we provide comparison results of classification task between ArSL2018 dataset, the only Arabic sign language letter dataset for classification task, and our dataset by running classification task on in-house short video. The results revealed that the model trained on our dataset has a superior performance over the model trained on ArSL2018. Moreover, we have created our prototype avatar which can mimic the ArSL (Arabic Sign Language) gestures for Metaverse applications. Finally, we believe, ArSL21L and the ArSL avatar will offer an opportunity to enhance the research and educational applications for not only the PwHL, but also in general real and virtual world applications.
Our ArSL21L benchmark dataset is publicly available for research use on Mendeley.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/artest.json b/datasets/artest.json new file mode 100644 index 0000000..3c788a1 --- /dev/null +++ b/datasets/artest.json @@ -0,0 +1,36 @@ +{ + "Name": "ArTest", + "Subsets": [], + "HF Link": "nan", + "Link": "https://www.dropbox.com/s/openq7fgt3kd6jg/Artest-Test-Collection.zip?dl=0", + "License": "unknown", + "Year": 2020, + "Language": "ar", + "Dialect": "mixed", + "Domain": "web pages", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "ArTest was built on top of the ArabicWeb16 Web collection of around 150M Arabic Web pages.", + "Volume": "10,529", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "-", + "Derived From": "ArabicWeb16", + "Paper Title": "ArTest: The First Test Collection for Arabic Web Search with Relevance Rationales", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "Dropbox", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "relevance judgments, judgment rationales", + "Venue Title": "SIGIR", + "Citations": "nan", + "Venue Type": "conference", + "Venue Name": "Special Interest Group on Information Retrieval", + "Authors": "Maram Hasanain, Yassmine Barkallah, Reem Suwaileh, Mucahid Kutlu, Tamer Elsayed", + "Affiliations": "Multiple Institutions", + "Abstract": "The scarcity of Arabic test collections has long hindered information retrieval (IR) research over the Arabic Web. In this work, we present ArTest, the first large-scale test collection designed for the evaluation of ad-hoc search over the Arabic Web. ArTest uses ArabicWeb16, a collection of around 150M Arabic Web pages as the document collection, and includes 50 topics, 10,529 relevance judgments, and (more importantly) a rationale behind each judgment. To our knowledge, this is also the first IR test collection that includes rationales of primary assessors (i.e., topic developers) for their relevance judgments, exhibiting a useful resource for understanding the relevance phenomena.
Finally, ArTest is made publicly-available for the research community.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/artrivia.json b/datasets/artrivia.json new file mode 100644 index 0000000..4436040 --- /dev/null +++ b/datasets/artrivia.json @@ -0,0 +1,36 @@ +{ + "Name": "ArTrivia", + "Subsets": [], + "HF Link": "nan", + "Link": "https://github.com/salrowili/ArTrivia/tree/main", + "License": "unknown", + "Year": 2023, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "wikipedia", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "10,045 question-answer-passages triplets extracted from Wikipedia passages", + "Volume": "10,045", + "Unit": "sentences", + "Ethical Risks": "nan", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "ArTrivia: Harvesting Arabic Wikipedia to Build A New Arabic Question Answering Dataset", + "Paper Link": "https://aclanthology.org/2023.arabicnlp-1.17/", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "question answering", + "Venue Title": "ArabicNLP", + "Citations": "nan", + "Venue Type": "conference", + "Venue Name": "Arabic Natural Language Processing Conference", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "We present ArTrivia, a new Arabic question-answering dataset consisting of more than 10,000 question-answer pairs along with relevant passages, covering a wide range of 18 diverse topics in Arabic. We created our dataset using a newly proposed pipeline that leverages diverse structured data sources from Arabic Wikipedia. Moreover, we conducted a comprehensive statistical analysis of ArTrivia and assessed the performance of each component in our pipeline. Additionally, we compared the performance of ArTrivia against the existing TyDi QA dataset using various experimental setups. Our analysis highlights the significance of often overlooked aspects in dataset creation, such as answer normalization, in enhancing the quality of QA datasets. 
Our evaluation also shows that ArTrivia presents more challenging and out-of-distribution questions to TyDi, raising questions about the feasibility of using ArTrivia as a complementary dataset to TyDi.", + "Added By": "Amr Keleg" +} \ No newline at end of file diff --git a/datasets/arvox.json b/datasets/arvox.json new file mode 100644 index 0000000..82d7ed7 --- /dev/null +++ b/datasets/arvox.json @@ -0,0 +1,36 @@ +{ + "Name": "ArVox", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/ArVox", + "Link": "https://www.kaggle.com/datasets/corpora4research/arpod-corpus-based-on-arabic-podcasts", + "License": "unknown", + "Year": 2019, + "Language": "multilingual", + "Dialect": "mixed", + "Domain": "transcribed audio", + "Form": "spoken", + "Collection Style": "crawling", + "Description": "designed for Multilingual and Arabic Dialect Identification", + "Volume": "nan", + "Unit": "hours", + "Ethical Risks": "Low", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "A Language Identification System Based on Voxforge Speech Corpus", + "Paper Link": "https://link.springer.com/chapter/10.1007/978-3-030-14118-9_53", + "Script": "Arab", + "Tokenized": "No", + "Host": "kaggle", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "dialect identification from speech ", + "Venue Title": "AMLTA", + "Citations": "nan", + "Venue Type": "conference", + "Venue Name": "International Conference on Advanced Machine Learning Technologies and Applications", + "Authors": "Khaled Lounnas, Mourad Abbas, Hocine Teffahi, Mohamed Lichouri ", + "Affiliations": "University of Sciences and Technology Houari Boumediene; CRSTDLA;University of Sciences and Technology Houari Boumediene; University of Sciences and Technology Houari Boumediene", + "Abstract": "In this work, we address the problem of identifying languages based on Voxforge speech corpus. We downloaded corpora for three languages: English, German and Persian from Voxforge. In addition, we recorded two additional corpora, the first one for Modern Standard Arabic (MSA) and the other one for Kabyl, one of the Algerian Berber dialects. To tackle this task, we used three classifiers, namely: k-Nearest Neighbors (kNN), Support Vector Machines (SVM) and Extra Trees Classifier. 
We obtained an average precision of 87.45% for binary classification compared to 44% for the multi-class one.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/arwiki.json b/datasets/arwiki.json new file mode 100644 index 0000000..6b84b3e --- /dev/null +++ b/datasets/arwiki.json @@ -0,0 +1,36 @@ +{ + "Name": "arwiki", + "Subsets": [], + "HF Link": "https://hf.co/datasets/CALM/arwiki", + "Link": "https://hf.co/datasets/CALM/arwiki", + "License": "unknown", + "Year": 2022, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "wikipedia", + "Form": "text", + "Collection Style": "crawling", + "Description": "This dataset is extracted from Arabic Wikipedia pages using the wikiextractor tool.", + "Volume": "1,136,455", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "CALM", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "text generation, language modeling", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/arzen-multigenre.json b/datasets/arzen-multigenre.json new file mode 100644 index 0000000..7215db3 --- /dev/null +++ b/datasets/arzen-multigenre.json @@ -0,0 +1,36 @@ +{ + "Name": "Arzen-Multigenre", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/ArzEn_MultiGenre_songs,https://hf.co/datasets/arbml/ArzEn_MultiGenre_subtitles,https://hf.co/datasets/arbml/ArzEn_MultiGenre_novels", + "Link": "https://data.mendeley.com/datasets/6k97jty9xg/3", + "License": "unknown", + "Year": 2023, + "Language": "ar", + "Dialect": "ar-EG: (Arabic (Egypt))", + "Domain": "transcribed audio", + "Form": "text", + "Collection Style": "manual curation", + "Description": "A parallel dataset of Egyptian Arabic song lyrics, novels, and TV show subtitles that are manually translated and aligned with their English counterparts.", + "Volume": "25,557", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "ArzEn-MultiGenre: An aligned parallel dataset of Egyptian Arabic song lyrics, novels, and subtitles, with English translations", + "Paper Link": "https://papers.ssrn.com/sol3/papers.cfm?abstract_id=4674389", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "machine translation, language modeling, dialect identification", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Amr Keleg" +} \ No newline at end of file diff --git a/datasets/asad.json b/datasets/asad.json new file mode 100644 index 0000000..2ce7baf --- /dev/null +++ b/datasets/asad.json @@ -0,0 +1,36 @@ +{ + "Name": "ASAD", + "Subsets": [], + "HF Link": "nan", + "Link": "https://wti.kaust.edu.sa/solve/Arabic-Sentiment-Analysis-Challenge", + "License": "unknown", + "Year": 2020, + "Language": "ar", + "Dialect": "mixed", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "ASAD is a large, high-quality annotated dataset (including 95K tweets), with three-class sentiment labels (positive, negative and neutral)", + "Volume": "100,000", + "Unit": "sentences", + "Ethical Risks": "Medium", + "Provider": "Multiple Institutions", + "Derived From": "nan", + "Paper Title": "ASAD: A Twitter-based Benchmark Arabic Sentiment Analysis Dataset", + "Paper Link": "https://arxiv.org/pdf/2011.00578.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Upon-Request", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "sentiment analysis", + "Venue Title": "ArXiv", + "Citations": "2.0", + "Venue Type": "preprint", + "Venue Name": "ArXiv", + "Authors": "Basma Alharbi, Hind Alamro, Manal Alshehri, Zuhair Khayyat, Manal Kalkatawi, I. Jaber, X. Zhang", + "Affiliations": ",,,,,,", + "Abstract": "This paper provides a detailed description of a new Twitter-based benchmark dataset for Arabic Sentiment Analysis (ASAD), which is launched in a competition, sponsored by KAUST for awarding 10000 USD, 5000 USD and 2000 USD to the first, second and third place winners, respectively. Compared to other publicly released Arabic datasets, ASAD is a large, high-quality annotated dataset (including 95K tweets), with three-class sentiment labels (positive, negative and neutral). We present the details of the data collection process and annotation process. In addition, we implement several baseline models for the competition task and report the results as a reference for the participants to the competition.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/asayar.json b/datasets/asayar.json new file mode 100644 index 0000000..2d0b464 --- /dev/null +++ b/datasets/asayar.json @@ -0,0 +1,36 @@ +{ + "Name": "ASAYAR", + "Subsets": [], + "HF Link": "nan", + "Link": "https://vcar.github.io/ASAYAR/", + "License": "Non Commercial Use - ELRA END USER", + "Year": 2020, + "Language": "ar", + "Dialect": "ar-MA: (Arabic (Morocco))", + "Domain": "other", + "Form": "images", + "Collection Style": "manual curation", + "Description": "The ASAYAR dataset is the first public dataset for Arabic and Latin scene text detection in highway traffic panels. It includes over 1,800 annotated images and can be used to develop and evaluate models for traffic sign detection and text detection in Arabic and French.", + "Volume": "1,800", + "Unit": "images", + "Ethical Risks": "Low", + "Provider": "Sidi Mohamed Ben Abdellah University - Fez Morocco", + "Derived From": "nan", + "Paper Title": "ASAYAR: A Dataset for Arabic-Latin Scene Text Localization in Highway Traffic Panels", + "Paper Link": "https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=9233923", + "Script": "Arab-Latn", + "Tokenized": "No", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "text detection, scene text localization, traffic sign detection", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "journal", + "Venue Name": "IEEE Transactions on Intelligent Transportation Systems", + "Authors": "M. Akallouch, K. S. Boujemaa, A. Bouhoute, K. Fardousse, I. Berrada", + "Affiliations": "Sidi Mohamed Ben Abdellah University - Fez Morocco", + "Abstract": "ASAYAR is the first public dataset for Arabic-Latin scene text localization in highway traffic panels, consisting of over 1800 images captured from the Moroccan Highway. It can be used for developing systems for Arabic and French text detection in traffic signs.
This paper presents the dataset and evaluates methods for scene text detection in highway panels.", + "Added By": "Maryam Al Emadi" +} \ No newline at end of file diff --git a/datasets/ashaar.json b/datasets/ashaar.json new file mode 100644 index 0000000..d47c556 --- /dev/null +++ b/datasets/ashaar.json @@ -0,0 +1,36 @@ +{ + "Name": "ashaar", + "Subsets": [], + "HF Link": "https://hf.co/datasets/MagedSaeed/ashaar", + "Link": "https://hf.co/datasets/MagedSaeed/ashaar", + "License": "unknown", + "Year": 2022, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "crawling", + "Description": "ashaar: a dataset for Arabic poetry", + "Volume": "254,000", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "HuggingFace", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "meter classification, poetry era classification, poetry theme classification", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "Maged S. Alshaibani, Zaid Alyafeai", + "Affiliations": "King Fahd University of Petroleum and Minerals", + "Abstract": "nan", + "Added By": "Maged S. Alshaibani" +} \ No newline at end of file diff --git a/datasets/askfm.json b/datasets/askfm.json new file mode 100644 index 0000000..8203a1f --- /dev/null +++ b/datasets/askfm.json @@ -0,0 +1,36 @@ +{ + "Name": "ASKFM", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/ASKFM", + "Link": "https://github.com/Omarito2412/ASKFM", + "License": "MIT License", + "Year": 2017, + "Language": "ar", + "Dialect": "mixed", + "Domain": "web pages", + "Form": "text", + "Collection Style": "crawling", + "Description": "This dataset is a merge of 98k questions and their respective answers as written by different authors on Askfm.", + "Volume": "98,000", + "Unit": "sentences", + "Ethical Risks": "High", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "information retrieval, question answering", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "Omar Essam", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Mustafa Ghaleb" +} \ No newline at end of file diff --git a/datasets/astad.json b/datasets/astad.json new file mode 100644 index 0000000..4d68385 --- /dev/null +++ b/datasets/astad.json @@ -0,0 +1,36 @@ +{ + "Name": "ASTAD", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/Sentiment_Analysis_Tweets", + "Link": "https://github.com/motazsaad/arabic-sentiment-analysis", + "License": "Apache-2.0", + "Year": 2020, + "Language": "ar", + "Dialect": "mixed", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "It contains 58K Arabic tweets (47K training, 11K test) annotated with positive and negative labels", + "Volume": "58,000", + "Unit": "sentences", + "Ethical Risks": "Medium", + "Provider": "Multiple Institutions", + "Derived From": "nan", + "Paper Title": "An Arabic Tweets Sentiment Analysis Dataset (ATSAD) using Distant Supervision and Self Training", + "Paper Link": "https://aclanthology.org/2020.osact-1.1.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "sentiment analysis", + "Venue Title": "OSACT", + "Citations": "4.0", + "Venue Type": "workshop", + "Venue Name": "Workshop on Open-Source Arabic Corpora and Processing Tools", + "Authors": "Kathrein Abu Kwaik, S. Chatzikyriakidis, Simon Dobnik, Motaz Saad, Richard Johansson", + "Affiliations": ",,,The Islamic University of Gaza,", + "Abstract": "As the number of social media users increases, they express their thoughts, needs, socialise and publish their opinions reviews. For good social media sentiment analysis, good quality resources are needed, and the lack of these resources is particularly evident for languages other than English, in particular Arabic. The available Arabic resources lack of from either the size of the corpus or the quality of the annotation. In this paper, we present an Arabic Sentiment Analysis Corpus collected from Twitter, which contains 36K tweets labelled into positive and negative. We employed distant supervision and self-training approaches into the corpus to annotate it. Besides, we release an 8K tweets manually annotated as a gold standard. We evaluated the corpus intrinsically by comparing it to human classification and pre-trained sentiment analysis models, Moreover, we apply extrinsic evaluation methods exploiting sentiment analysis task and achieve an accuracy of 86%.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/astd.json b/datasets/astd.json new file mode 100644 index 0000000..021a088 --- /dev/null +++ b/datasets/astd.json @@ -0,0 +1,36 @@ +{ + "Name": "ASTD", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/ASTD", + "Link": "https://github.com/mahmoudnabil/ASTD", + "License": "GPL-2.0", + "Year": 2015, + "Language": "ar", + "Dialect": "mixed", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "10k Arabic sentiment tweets classified into four classes: subjective positive, subjective negative, subjective mixed, and objective", + "Volume": "10,006", + "Unit": "sentences", + "Ethical Risks": "Medium", + "Provider": "Cairo University", + "Derived From": "nan", + "Paper Title": "ASTD: Arabic Sentiment Tweets Dataset", + "Paper Link": "https://aclanthology.org/D15-1299.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "sentiment analysis", + "Venue Title": "EMNLP", + "Citations": "178.0", + "Venue Type": "conference", + "Venue Name": "Conference on Empirical Methods in Natural Language Processing", + "Authors": "Mahmoud Nabil, Mohamed A. Aly, A. Atiya", + "Affiliations": ",,", + "Abstract": "This paper introduces ASTD, an Arabic social sentiment analysis dataset gathered from Twitter. It consists of about 10,000 tweets which are classified as objective, subjective positive, subjective negative, and subjective mixed. We present the properties and the statistics of the dataset, and run experiments using standard partitioning of the dataset.
Our experiments provide benchmark results for 4 way sentiment classification on the dataset.", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/at-odtsa.json b/datasets/at-odtsa.json new file mode 100644 index 0000000..782ab70 --- /dev/null +++ b/datasets/at-odtsa.json @@ -0,0 +1,36 @@ +{ + "Name": "AT-ODTSA", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/AT_ODSTA", + "Link": "https://github.com/sabudalfa/AT-ODTSA", + "License": "unknown", + "Year": 2022, + "Language": "ar", + "Dialect": "mixed", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "A dataset of Arabic Tweets for Open-Domain Targeted Sentiment Analysis, which includes Arabic tweets along with labels that specify targets (topics) and sentiments (opinions) expressed in the collected tweets.", + "Volume": "3,000", + "Unit": "sentences", + "Ethical Risks": "Medium", + "Provider": "Multiple Institutions", + "Derived From": "nan", + "Paper Title": "AT-ODTSA: a Dataset of Arabic Tweets for Open Domain Targeted Sentiment Analysis", + "Paper Link": "https://journal.uob.edu.bh/bitstream/handle/123456789/4607/IJCDS-1101105-1570749771.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "open-domain targeted sentiment analysis", + "Venue Title": "IJCDS", + "Citations": "nan", + "Venue Type": "journal", + "Venue Name": "International Journal of Computing and Digital Systems", + "Authors": "Shaaban Sahmoud, Shadi Abudalfa, Wisam Elmasry", + "Affiliations": "Department of Computer Engineering, Fatih Sultan Mehmet Vakif University; Information Technology Department, University College of Applied Sciences; Department of Computer Engineering, Istanbul Kultur University", + "Abstract": "In the field of sentiment analysis, most of research has conducted experiments on datasets collected from Twitter for manipulating a specific language. Little number of datasets has been collected for detecting sentiments expressed in Arabic tweets. Moreover, very limited number of such datasets is suitable for conducting recent research directions such as target dependent sentiment analysis and open-domain targeted sentiment analysis. Thereby, there is a dire need for reliable datasets that are specifically acquired for open-domain targeted sentiment analysis with Arabic language. Therefore, in this paper, we introduce AT-ODTSA, a dataset of Arabic Tweets for Open-Domain Targeted Sentiment Analysis, which includes Arabic tweets along with labels that specify targets (topics) and sentiments (opinions) expressed in the collected tweets. To the best of our knowledge, our work presents the first dataset that manually annotated for applying Arabic open-domain targeted sentiment analysis. We also present a detailed statistical analysis of the dataset. The AT-ODTSA dataset is suitable for training numerous machine learning models such as a deep learning-based model.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/atar.json b/datasets/atar.json new file mode 100644 index 0000000..af0304b --- /dev/null +++ b/datasets/atar.json @@ -0,0 +1,36 @@ +{ + "Name": "ATAR", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/Arabizi_Transliteration", + "Link": "https://github.com/bashartalafha/Arabizi-Transliteration", + "License": "CC BY-SA", + "Year": 2021, + "Language": "ar", + "Dialect": "mixed", + "Domain": "other", + "Form": "text", + "Collection Style": "manual curation", + "Description": "Arabizi transliteration", + "Volume": "2,743", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "Multiple Institutions", + "Derived From": "nan", + "Paper Title": "Atar: Attention-based LSTM for Arabizi transliteration", + "Paper Link": "http://ijece.iaescore.com/index.php/IJECE/article/view/22767/14781", + "Script": "Arab-Latn", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "transliteration", + "Venue Title": "IJECE", + "Citations": "0.0", + "Venue Type": "journal", + "Venue Name": "International Journal of Electrical and Computer Engineering", + "Authors": "Bashar Talafha, Analle Abuammar, M. Al-Ayyoub", + "Affiliations": ",,", + "Abstract": "A non-standard romanization of Arabic script, known as Arbizi, is widely used in Arabic online and SMS/chat communities. However, since state-of-the-art tools and applications for Arabic NLP expects Arabic to be written in Arabic script, handling contents written in Arabizi requires a special attention either by building customized tools or by transliterating them into Arabic script. The latter approach is the more common one and this work presents two significant contributions in this direction. The first one is to collect and publicly release the first large-scale \u201cArabizi to Arabic script\u201d parallel corpus focusing on the Jordanian dialect and consisting of more than 25 k pairs carefully created and inspected by native speakers to ensure highest quality. Second, we present Atar, an attention-based encoder-decoder model for Arabizi transliteration. Training and testing this model on our dataset yields impressive accuracy (79%) and BLEU score (88.49).", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/athar.json b/datasets/athar.json new file mode 100644 index 0000000..93015a1 --- /dev/null +++ b/datasets/athar.json @@ -0,0 +1,36 @@ +{ + "Name": "ATHAR", + "Subsets": [], + "HF Link": "https://hf.co/datasets/mohamed-khalil/ATHAR", + "Link": "https://hf.co/datasets/mohamed-khalil/ATHAR", + "License": "CC BY-SA 4.0", + "Year": 2024, + "Language": "multilingual", + "Dialect": "ar-CLS: (Arabic (Classic))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "The ATHAR dataset comprises 66,000 translation pairs from Classical Arabic to English. It spans a wide array of subjects, aiming to enhance the development of NLP models specialized in Classical Arabic.
This dataset addresses the gap in translation resources for Classical Arabic and is useful for fine-tuning large language models (LLMs) to improve their performance in handling ancient texts.", + "Volume": "66,000", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "ADAPT/DCU, Dublin, Ireland", + "Derived From": "Classical Arabic literary texts, collected primarily from Rasaif, a website that hosts a large collection of Classical Arabic books.", + "Paper Title": "ATHAR: A High-Quality and Diverse Dataset for Classical Arabic to English Translation", + "Paper Link": "https://arxiv.org/pdf/2407.19835", + "Script": "Arab-Latn", + "Tokenized": "Yes", + "Host": "HuggingFace", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "machine translation", + "Venue Title": "arXiv", + "Citations": "nan", + "Venue Type": "preprint", + "Venue Name": "nan", + "Authors": "Mohammed Khalil, Mohammed Sabry ", + "Affiliations": "Independent Researcher, ADAPT/DCU, Dublin, Ireland", + "Abstract": "ATHAR is a dataset comprising 66,000 high-quality translation pairs from Classical Arabic to English, aimed at improving NLP and machine translation systems. The dataset covers a broad range of Classical Arabic texts, including science, philosophy, and history. Evaluations on state-of-the-art language models indicate that fine-tuning with this dataset significantly improves translation accuracy.", + "Added By": "Maryam Al Emadi" +} \ No newline at end of file diff --git a/datasets/attimam.json b/datasets/attimam.json new file mode 100644 index 0000000..f471a2f --- /dev/null +++ b/datasets/attimam.json @@ -0,0 +1,36 @@ +{ + "Name": "AttImam", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2022T02", + "License": "LDC User Agreement for Non-Members", + "Year": 2022, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "AttImam was developed by Al-Imam Mohammad Ibn Saud Islamic University and consists of approximately 2,000 attribution relations applied to Arabic newswire text from Arabic Treebank: Part 1 v 4.1 (LDC2010T13). 
Attribution refers to the process of reporting or assigning an utterance to the correct speaker.", + "Volume": "2,000", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "Al-Imam Mohammad Ibn Saud Islamic University", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab-Latn", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "250.00 $", + "Test Split": "No", + "Tasks": "discourse analysis, entity extraction, language identification", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "Amal Alsaif, Tasniem Alyahya, Madawi Alotibi, Huda Almuzaini, Abeer Alqahtani", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Ahmed Ruby" +} \ No newline at end of file diff --git a/datasets/author_attribution_tweets.json b/datasets/author_attribution_tweets.json new file mode 100644 index 0000000..296a3cf --- /dev/null +++ b/datasets/author_attribution_tweets.json @@ -0,0 +1,36 @@ +{ + "Name": "Author Attribution Tweets", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/Author_Attribution_Tweets", + "Link": "https://fada.birzeit.edu/handle/20.500.11889/6743", + "License": "unknown", + "Year": 2021, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "Consists of 71,397 tweets for 45 authors in MSA collected from Twitter.", + "Volume": "71,397", + "Unit": "sentences", + "Ethical Risks": "Medium", + "Provider": "Birzeit University", + "Derived From": "nan", + "Paper Title": "Authorship Attribution of Modern Standard Arabic Short Texts", + "Paper Link": "https://fada.birzeit.edu/bitstream/20.500.11889/6787/1/AA_PAPER___ACM.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "authorship attribution", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "preprint", + "Venue Name": "nan", + "Authors": "Yara Abuhammad, Yara Addabe, Nataly Ayyad, Adnan Yahya", + "Affiliations": "Department of Electrical and Computer Engineering, Birzeit University, Palestine, Department of Electrical and Computer Engineering, Birzeit University, Palestine, Department of Electrical and Computer Engineering, Birzeit University, Palestine, Department of Electrical and Computer Engineering, Birzeit University, Palestine", + "Abstract": "Text data, including short texts, constitute a major share of web content. The availability of this data to billions of users triggers frequent plagiarism attacks. Authorship Attribution (AA) seeks to identify the most probable author of a given text based on similarity to the writing style of potential authors. In this paper, we approach AA as a writing style profile generation process, where we group text instances for each author into a single profile. We use Twitter as the source for our short Modern Standard Arabic (MSA) texts. Numerous experiments with various training approaches, tools and features allowed us to settle on a text representation method that relies on text concatenation of Arabic tweets to form chunks, which are then duplicated to reach a precalculated length. These chunks are used to train machine learning models for our 45 author profiles.
This allowed us to achieve accuracies up to 99%, which compares\nfavorably with the best results reported in the literature.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/autotweet.json b/datasets/autotweet.json new file mode 100644 index 0000000..000ab2c --- /dev/null +++ b/datasets/autotweet.json @@ -0,0 +1,36 @@ +{ + "Name": "AutoTweet", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/AutoTweet", + "Link": "https://www.dropbox.com/s/amnv06boef2vn4k/Autotweet-Dataset-v1.0.zip?dl=0", + "License": "unknown", + "Year": 2015, + "Language": "ar", + "Dialect": "mixed", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "Classification of Arabic tweets into automated or manual.", + "Volume": "3,503", + "Unit": "sentences", + "Ethical Risks": "Medium", + "Provider": "Qatar University", + "Derived From": "nan", + "Paper Title": "Detecting Automatically-Generated Arabic Tweets", + "Paper Link": "https://link.springer.com/chapter/10.1007/978-3-319-28940-3_10", + "Script": "Arab", + "Tokenized": "No", + "Host": "Dropbox", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "gender detection", + "Venue Title": "AIRS", + "Citations": "10.0", + "Venue Type": "conference", + "Venue Name": "Asia Information Retrieval Societies Conference", + "Authors": "H. Almerekhi,T. Elsayed", + "Affiliations": ",", + "Abstract": "Recently, Twitter, one of the most widely-known social media platforms, got infiltrated by several automation programs, commonly known as \u201cbots\u201d. Bots can be easily abused to spread spam and hinder information extraction applications by posting lots of automatically-generated tweets that occupy a good portion of the continuous stream of tweets. This problem heavily affects users in the Arab region due to the recent developing political events as automated tweets can disturb communication and waste time needed in filtering such tweets.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/aya_dataset.json b/datasets/aya_dataset.json new file mode 100644 index 0000000..189522a --- /dev/null +++ b/datasets/aya_dataset.json @@ -0,0 +1,36 @@ +{ + "Name": "Aya Dataset", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/aya_ar", + "Link": "https://hf.co/datasets/CohereForAI/aya_dataset", + "License": "Apache-2.0", + "Year": 2024, + "Language": "multilingual", + "Dialect": "mixed", + "Domain": "other", + "Form": "text", + "Collection Style": "manual curation", + "Description": "The Aya Dataset is a multilingual instruction fine-tuning dataset curated by an open-science community via the Aya Annotation Platform from Cohere For AI. The dataset contains a total of 204k human-annotated prompt-completion pairs along with the demographics data of the annotators.", + "Volume": "14,250", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "Cohere For AI", + "Derived From": "nan", + "Paper Title": "Aya Dataset: An Open-Access Collection for Multilingual Instruction Tuning", + "Paper Link": "https://arxiv.org/pdf/2402.06619.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "HuggingFace", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "question answering, instruction tuning", + "Venue Title": "arXiv", + "Citations": "nan", + "Venue Type": "preprint", + "Venue Name": "nan", + "Authors": "Shivalika Singh, Freddie Vargus, Daniel Dsouza, B\u00f6rje F.
Karlsson, Abinaya Mahendiran, Wei-Yin Ko, Herumb Shandilya, Jay Patel, Deividas Mataciunas, Laura OMahony, Mike Zhang, Ramith Hettiarachchi, Joseph Wilson, Marina Machado, Luisa Souza Moura, Dominik Krzemi\u0144ski, Hakimeh Fadaei, Irem Erg\u00fcn, Ifeoma Okoh, Aisha Alaagib, Oshan Mudannayake, Zaid Alyafeai, Vu Minh Chien, Sebastian Ruder, Surya Guthikonda, Emad A. Alghamdi, Sebastian Gehrmann, Niklas Muennighoff, Max Bartolo, Julia Kreutzer, Ahmet \u00dcst\u00fcn, Marzieh Fadaee and Sara Hooker", + "Affiliations": "nan", + "Abstract": "Datasets are foundational to many breakthroughs in modern artificial intelligence. Many recent\nachievements in the space of natural language processing (NLP) can be attributed to the finetuning of pre-trained models on a diverse set of tasks that enables a large language model (LLM)\nto respond to instructions. Instruction fine-tuning (IFT) requires specifically constructed and annotated datasets. However, existing datasets are almost all in the English language. In this work,\nour primary goal is to bridge the language gap by building a human-curated instruction-following\ndataset spanning 65 languages. We worked with fluent speakers of languages from around the world\nto collect natural instances of instructions and completions. Furthermore, we create the most extensive multilingual collection to date, comprising 513 million instances through templating and\ntranslating existing datasets across 114 languages. In total, we contribute four key resources: we\ndevelop and open-source the Aya Annotation Platform, the Aya Dataset, the Aya Collection,\nand the Aya Evaluation Suite. The Aya initiative also serves as a valuable case study in participatory research, involving collaborators from 119 countries. We see this as a valuable framework\nfor future research collaborations that aim to bridge gaps in resources.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/ayatec.json b/datasets/ayatec.json new file mode 100644 index 0000000..10622c8 --- /dev/null +++ b/datasets/ayatec.json @@ -0,0 +1,36 @@ +{ + "Name": "AyaTEC", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/AyaTEC", + "Link": "http://qufaculty.qu.edu.qa/telsayed/datasets/", + "License": "unknown", + "Year": 2020, + "Language": "ar", + "Dialect": "ar-CLS: (Arabic (Classic))", + "Domain": "other", + "Form": "text", + "Collection Style": "manual curation", + "Description": "QA on the Holy Qur\u2019an Dataset", + "Volume": "207", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "Qatar University", + "Derived From": "nan", + "Paper Title": "AyaTEC: Building a Reusable Verse-Based Test Collection for Arabic Question Answering on the Holy Qur\u2019an", + "Paper Link": "https://dl.acm.org/doi/pdf/10.1145/3400396", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "question answering", + "Venue Title": "TALLIP", + "Citations": "0.0", + "Venue Type": "journal", + "Venue Name": "ACM Transactions on Asian and Low-Resource Language Information Processing", + "Authors": "R. Malhas,Tamer Elsayed", + "Affiliations": ",", + "Abstract": "The absence of publicly available reusable test collections for Arabic question answering on the Holy Qur\u2019an has impeded the possibility of fairly comparing the performance of systems in that domain.
In this article, we introduce AyaTEC, a reusable test collection for verse-based question answering on the Holy Qur\u2019an, which serves as a common experimental testbed for this task. AyaTEC includes 207 questions (with their corresponding 1,762 answers) covering 11 topic categories of the Holy Qur\u2019an that target the information needs of both curious and skeptical users. To the best of our effort, the answers to the questions (each represented as a sequence of verses) in AyaTEC were exhaustive\u2014that is, all qur\u2019anic verses that directly answered the questions were exhaustively extracted and annotated. To facilitate the use of AyaTEC in evaluating the systems designed for that task, we propose several evaluation measures to support the different types of questions and the nature of verse-based answers while integrating the concept of partial matching of answers in the evaluation.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/baec.json b/datasets/baec.json new file mode 100644 index 0000000..92882e2 --- /dev/null +++ b/datasets/baec.json @@ -0,0 +1,55 @@ +{ + "Name": "BAEC", + "Subsets": [ + { + "Name": "SDC", + "Dialect": "ar-SA: (Arabic (Saudi Arabia))", + "Volume": "210,396", + "Unit": "tokens" + }, + { + "Name": "EDC", + "Dialect": "ar-EG: (Arabic (Egypt))", + "Volume": "218,149", + "Unit": "tokens" + }, + { + "Name": "MSA", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Volume": "45,251", + "Unit": "tokens" + } + ], + "HF Link": "https://hf.co/datasets/arbml/BAEC", + "Link": "https://github.com/TaghreedT/BAEC", + "License": "unknown", + "Year": 2020, + "Language": "multilingual", + "Dialect": "mixed", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "Consists of 45,251 words and is 436 KB in size. It was collected from different Facebook pages", + "Volume": "473,796", + "Unit": "tokens", + "Ethical Risks": "Medium", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "Compression versus traditional machine learning classifiers to detect code-switching in varieties and dialects: Arabic as a case study", + "Paper Link": "https://eprints.whiterose.ac.uk/155881/1/tarmom18nlejV8tex.pdf", + "Script": "Arab-Latn", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "codeswitch detection", + "Venue Title": "NLE", + "Citations": "1.0", + "Venue Type": "journal", + "Venue Name": "Natural Language Engineering", + "Authors": "Taghreed Tarmom,W. Teahan,E. Atwell,Mohammad Ammar Alsalka", + "Affiliations": ",,,", + "Abstract": "The occurrence of code-switching in online communication, when a writer switches among multiple languages, presents a challenge for natural language processing tools, since they are designed for texts written in a single language. To answer the challenge, this paper presents detailed research on ways to detect code-switching in Arabic text automatically. We compare the prediction by partial matching (PPM) compression-based classifier, implemented in Tawa, and a traditional machine learning classifier sequential minimal optimization (SMO), implemented in Waikato Environment for Knowledge Analysis, working specifically on Arabic text taken from Facebook.
Three experiments were conducted in order to: (1) detect code-switching among the Egyptian dialect and English; (2) detect code-switching among the Egyptian dialect, the Saudi dialect, and English; and (3) detect code-switching among the Egyptian dialect, the Saudi dialect, Modern Standard Arabic (MSA), and English. Our experiments showed that PPM achieved a higher accuracy rate than SMO with 99.8% versus 97.5% in the first experiment and 97.8% versus 80.7% in the second. In the third experiment, PPM achieved a lower accuracy rate than SMO with 53.2% versus 60.2%. Code-switching between Egyptian Arabic and English text is easiest to detect because Arabic and English are generally written in different character sets. It is more difficult to distinguish between Arabic dialects and MSA as these use the same character set, and most users of Arabic, especially Saudis and Egyptians, frequently mix MSA with their dialects. We also note that the MSA corpus used for training the MSA model may not represent MSA Facebook text well, being built from news websites. This paper also describes in detail the new Arabic corpora created for this research and our experiments.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/baladi_lebanese_dialect_corpora.json b/datasets/baladi_lebanese_dialect_corpora.json new file mode 100644 index 0000000..d993732 --- /dev/null +++ b/datasets/baladi_lebanese_dialect_corpora.json @@ -0,0 +1,36 @@ +{ + "Name": "Baladi Lebanese dialect corpora", + "Subsets": [], + "HF Link": "nan", + "Link": "https://portal.sina.birzeit.edu/curras", + "License": "CC BY-NC-SA 4.0", + "Year": 2022, + "Language": "ar", + "Dialect": "ar-LB: (Arabic (Lebanon))", + "Domain": "other", + "Form": "text", + "Collection Style": "crawling", + "Description": "The corpus consists of about 9.6K words/tokens collected from Facebook, blog posts and traditional poems. The corpus was annotated as an extension to Curras and following the same annotation methodology to form a Levantine Corpus.", + "Volume": "10,000", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "SinaLab, Birzeit University", + "Derived From": "nan", + "Paper Title": "Curras + Baladi: Towards a Levantine Corpus", + "Paper Link": "https://arxiv.org/pdf/2212.06468.pdf", + "Script": "Arab", + "Tokenized": "Yes", + "Host": "Dropbox", + "Access": "Upon-Request", + "Cost": "nan", + "Test Split": "No", + "Tasks": "machine translation, speech recognition, dialect identification, named entity recognition, part of speech tagging, language identification, morphological analysis", + "Venue Title": "LREC", + "Citations": "nan", + "Venue Type": "conference", + "Venue Name": "International Conference on Language Resources and Evaluation", + "Authors": "Karim El Haff, Mustafa Jarrar, Tymaa Hammouda, Fadi Zaraket", + "Affiliations": "nan", + "Abstract": "The processing of the Arabic language is a complex field of research. This is due to many factors, including the complex and rich morphology of Arabic, its high degree of ambiguity, and the presence of several regional varieties that need to be processed while taking into account their unique characteristics. When its dialects are taken into account, this language pushes the limits of NLP to find solutions to problems posed by its inherent nature. It is a diglossic language; the standard language is used in formal settings and in education and is quite different from the vernacular languages spoken in the different regions and influenced by older languages that were historically spoken in those regions.
This should encourage NLP specialists to create dialect-specific corpora such as the Palestinian morphologically annotated Curras corpus of Birzeit University. In this work, we present the Lebanese Corpus Baladi that consists of around 9.6K morphologically annotated tokens. Since Lebanese and Palestinian dialects are part of the same Levantine dialectal continuum, and thus highly mutually intelligible, our proposed corpus was constructed to be used to (1) enrich Curras and transform it into a more general Levantine corpus and (2) improve Curras by solving detected errors.", + "Added By": "Mustafa Jarrar" +} \ No newline at end of file diff --git a/datasets/baved.json b/datasets/baved.json new file mode 100644 index 0000000..296a3bc --- /dev/null +++ b/datasets/baved.json @@ -0,0 +1,36 @@ +{ + "Name": "BAVED", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/BAVED", + "Link": "https://github.com/40uf411/Basic-Arabic-Vocal-Emotions-Dataset", + "License": "unknown", + "Year": 2019, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "transcribed audio", + "Form": "spoken", + "Collection Style": "manual curation", + "Description": "Basic Arabic Vocal Emotions Dataset (BAVED) is a dataset that contains Arabic words spoken at different levels of emotion, recorded in audio/WAV format", + "Volume": "1,935", + "Unit": "tokens", + "Ethical Risks": "Medium", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "speech emotion recognition", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Emad A. Alghamdi" +} \ No newline at end of file diff --git a/datasets/bbn_aub_darpa_babylon_levantine_arabic_speech_and_transcripts.json b/datasets/bbn_aub_darpa_babylon_levantine_arabic_speech_and_transcripts.json new file mode 100644 index 0000000..426ab43 --- /dev/null +++ b/datasets/bbn_aub_darpa_babylon_levantine_arabic_speech_and_transcripts.json @@ -0,0 +1,36 @@ +{ + "Name": "BBN/AUB DARPA Babylon Levantine Arabic Speech and Transcripts", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2005S08", + "License": "LDC User Agreement for Non-Members", + "Year": 2005, + "Language": "ar", + "Dialect": "ar-LEV: (Arabic (Levant))", + "Domain": "transcribed audio", + "Form": "spoken", + "Collection Style": "other", + "Description": "BBN/AUB DARPA Babylon Levantine Arabic Speech and Transcripts was developed by BBN Technologies and contains 60.6 hours of spontaneous speech recorded from subjects speaking Levantine colloquial Arabic and associated transcripts. Levantine Arabic is the dialect of Arabic spoken in Lebanon, Jordan, Syria, and Palestine. It is significantly different from Modern Standard Arabic.
It is a spoken rather than a written language, and includes different words and pronunciations from Modern Standard Arabic.", + "Volume": "60.6", + "Unit": "hours", + "Ethical Risks": "nan", + "Provider": "BBN Technologies", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "2,500 $", + "Test Split": "No", + "Tasks": "machine translation, speech recognition, spoken dialogue systems", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "BBN Technologies (with American University of Beirut a subcontractor), John Makhoul, Bushra Zawaydeh, Frederick Choi, David Stallard", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Kamel GAANOUN" +} \ No newline at end of file diff --git a/datasets/bbn_blog_posts_sentiment_corpus.json b/datasets/bbn_blog_posts_sentiment_corpus.json new file mode 100644 index 0000000..e53dc41 --- /dev/null +++ b/datasets/bbn_blog_posts_sentiment_corpus.json @@ -0,0 +1,36 @@ +{ + "Name": "BBN Blog Posts Sentiment Corpus", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/BBN_Blog_Posts", + "Link": "https://saifmohammad.com/WebPages/ArabicSA.html", + "License": "unknown", + "Year": 2015, + "Language": "ar", + "Dialect": "ar-LEV: (Arabic (Levant))", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "A random subset of 1200 Levantine dialectal sentences chosen from the BBN Arabic-Dialect\u2013English Parallel Text", + "Volume": "1,200", + "Unit": "sentences", + "Ethical Risks": "Medium", + "Provider": "Multiple Institutions", + "Derived From": "BBN Arabic-Dialect\u2013English Parallel Text", + "Paper Title": "Sentiment after Translation: A Case-Study on Arabic Social Media Posts", + "Paper Link": "https://aclanthology.org/N15-1078.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "sentiment analysis,machine translation", + "Venue Title": "NAACL", + "Citations": "125.0", + "Venue Type": "conference", + "Venue Name": "North American Chapter of the Association for Computational Linguistics", + "Authors": "Mohammad Salameh,Saif M. Mohammad,Svetlana Kiritchenko", + "Affiliations": ",National Research Council Canada,", + "Abstract": "When text is translated from one language into another, sentiment is preserved to varying degrees. In this paper, we use Arabic social media posts as stand-in for source language text, and determine loss in sentiment predictability when they are translated into English, manually and automatically. As benchmarks, we use manually and automatically determined sentiment labels of the Arabic texts. We show that sentiment analysis of English translations of Arabic texts produces competitive results, w.r.t. Arabic sentiment analysis.
We discover that even though translation significantly reduces the human ability to recover sentiment, automatic sentiment systems are still able to capture sentiment information from the translations.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/belebele.json b/datasets/belebele.json new file mode 100644 index 0000000..45e15ac --- /dev/null +++ b/datasets/belebele.json @@ -0,0 +1,73 @@ +{ + "Name": "Belebele", + "Subsets": [ + { + "Name": "acm_Arab", + "Dialect": "ar-IQ: (Arabic (Iraq))", + "Volume": "900", + "Unit": "sentences" + }, + { + "Name": "arb_Arab", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Volume": "900", + "Unit": "sentences" + }, + { + "Name": "apc_Arab", + "Dialect": "ar-LEV: (Arabic (Levant))", + "Volume": "900", + "Unit": "sentences" + }, + { + "Name": "ars_Arab", + "Dialect": "ar-SA: (Arabic (Saudi Arabia))", + "Volume": "900", + "Unit": "sentences" + }, + { + "Name": "ary_Arab", + "Dialect": "ar-MA: (Arabic (Morocco))", + "Volume": "900", + "Unit": "sentences" + }, + { + "Name": "arz_Arab", + "Dialect": "ar-EG: (Arabic (Egypt))", + "Volume": "900", + "Unit": "sentences" + } + ], + "HF Link": "https://hf.co/datasets/facebook/belebele", + "Link": "https://github.com/facebookresearch/belebele", + "License": "CC BY-NC-SA 4.0", + "Year": 2023, + "Language": "multilingual", + "Dialect": "mixed", + "Domain": "wikipedia", + "Form": "text", + "Collection Style": "human translation", + "Description": "A multiple-choice machine reading comprehension (MRC) dataset spanning 122 language variants.", + "Volume": "5,400", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "Meta AI", + "Derived From": "nan", + "Paper Title": "The BELEBELE Benchmark: a Parallel Reading Comprehension Dataset in 122 Language Variants", + "Paper Link": "https://arxiv.org/pdf/2308.16884.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "question answering, instruction tuning", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Amr Keleg" +} \ No newline at end of file diff --git a/datasets/bible_para.json b/datasets/bible_para.json new file mode 100644 index 0000000..42e0aa7 --- /dev/null +++ b/datasets/bible_para.json @@ -0,0 +1,36 @@ +{ + "Name": "bible_para", + "Subsets": [], + "HF Link": "https://hf.co/datasets/Helsinki-NLP/bible_para", + "Link": "https://hf.co/datasets/bible_para", + "License": "CC0", + "Year": 2014, + "Language": "multilingual", + "Dialect": "ar-CLS: (Arabic (Classic))", + "Domain": "books", + "Form": "text", + "Collection Style": "human translation", + "Description": "This is a multilingual parallel corpus created from translations of the Bible", + "Volume": "2,800,000", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "OPUS", + "Derived From": "nan", + "Paper Title": "A massively parallel corpus: the Bible in 100 languages", + "Paper Link": "https://link.springer.com/content/pdf/10.1007/s10579-014-9287-y.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "HuggingFace", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "machine translation", + "Venue Title": "Language Resources and Evaluation", + "Citations": "49.0", + "Venue Type": "journal", + "Venue Name": "Language Resources and Evaluation", + "Authors": "Christos Christodoulopoulos, Mark Steedman", + "Affiliations": 
"UIUC, University of Edinburgh", + "Abstract": "We describe the creation of a massively parallel corpus based on 100 translations of the Bible. We discuss some of the difficulties in acquiring and processing the raw material as well as the potential of the Bible as a corpus for natural language processing. Finally we present a statistical analysis of the corpora collected and a detailed comparison between the English translation and other English corpora.", + "Added By": "Khalid N. Elmadani" +} \ No newline at end of file diff --git a/datasets/bnl_historical_newspapers.json b/datasets/bnl_historical_newspapers.json new file mode 100644 index 0000000..cc6e08c --- /dev/null +++ b/datasets/bnl_historical_newspapers.json @@ -0,0 +1,36 @@ +{ + "Name": "BnL Historical Newspapers", + "Subsets": [], + "HF Link": "https://hf.co/datasets/bnl-data/bnl_newspapers", + "Link": "https://data.bnl.lu/data/historical-newspapers/", + "License": "CC0", + "Year": 2022, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "other", + "Description": "The BnL has digitised over 800.000 pages of Luxembourg newspapers", + "Volume": "1", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "BnL", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "text generation, language modeling", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/bolt_arabic_discussion_forum_parallel_training_data.json b/datasets/bolt_arabic_discussion_forum_parallel_training_data.json new file mode 100644 index 0000000..352cb61 --- /dev/null +++ b/datasets/bolt_arabic_discussion_forum_parallel_training_data.json @@ -0,0 +1,36 @@ +{ + "Name": "BOLT Arabic Discussion Forum Parallel Training Data", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2019T01", + "License": "LDC User Agreement for Non-Members", + "Year": 2019, + "Language": "multilingual", + "Dialect": "ar-EG: (Arabic (Egypt))", + "Domain": "commentary", + "Form": "text", + "Collection Style": "other", + "Description": "The source data in this release consists of discussion forum threads harvested from the Internet by LDC using a combination of manual and automatic processes. 
The full source data collection is released as BOLT Arabic Discussion Forums (LDC2018T10).", + "Volume": "2,651", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "1,750.00 $", + "Test Split": "No", + "Tasks": "machine translation", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/bolt_arabic_discussion_forums.json b/datasets/bolt_arabic_discussion_forums.json new file mode 100644 index 0000000..d94e83d --- /dev/null +++ b/datasets/bolt_arabic_discussion_forums.json @@ -0,0 +1,36 @@ +{ + "Name": "BOLT Arabic Discussion Forums", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2018T10", + "License": "LDC User Agreement for Non-Members", + "Year": 2018, + "Language": "ar", + "Dialect": "mixed", + "Domain": "other", + "Form": "text", + "Collection Style": "crawling", + "Description": "BOLT Arabic Discussion Forums was developed by the Linguistic Data Consortium (LDC) and consists of 813,080 discussion forum threads in Egyptian Arabic harvested from the Internet using a combination of manual and automatic processes.", + "Volume": "813,080", + "Unit": "documents", + "Ethical Risks": "Medium", + "Provider": "University of Pennsylvania", + "Derived From": "nan", + "Paper Title": "BOLT Arabic Discussion Forums", + "Paper Link": "https://catalog.ldc.upenn.edu/LDC2018T10", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "With-Fee", + "Cost": "nan", + "Test Split": "No", + "Tasks": "machine translation", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "Jennifer Tracey, Haejoong Lee, Stephanie Strassel, Safa Ismael", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Abdelrahman Rezk" +} \ No newline at end of file diff --git a/datasets/bolt_egyptian-english_word_alignment_--_discussion_forum_training.json b/datasets/bolt_egyptian-english_word_alignment_--_discussion_forum_training.json new file mode 100644 index 0000000..13018a9 --- /dev/null +++ b/datasets/bolt_egyptian-english_word_alignment_--_discussion_forum_training.json @@ -0,0 +1,36 @@ +{ + "Name": "BOLT Egyptian-English Word Alignment -- Discussion Forum Training", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2019T06", + "License": "LDC User Agreement for Non-Members", + "Year": 2019, + "Language": "multilingual", + "Dialect": "ar-EG: (Arabic (Egypt))", + "Domain": "commentary", + "Form": "text", + "Collection Style": "other", + "Description": "This release consists of Egyptian source discussion forum threads harvested from the Internet by LDC using a combination of manual and automatic processes. 
The source data is released as BOLT Arabic Discussion Forums (LDC2018T10).", + "Volume": "400,448", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "1,750.00 $", + "Test Split": "No", + "Tasks": "information retrieval,machine translation", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/bolt_egyptian_arabic-english_word_alignment_--_conversational_telephone_speech_training.json b/datasets/bolt_egyptian_arabic-english_word_alignment_--_conversational_telephone_speech_training.json new file mode 100644 index 0000000..bafa2c4 --- /dev/null +++ b/datasets/bolt_egyptian_arabic-english_word_alignment_--_conversational_telephone_speech_training.json @@ -0,0 +1,36 @@ +{ + "Name": "BOLT Egyptian Arabic-English Word Alignment -- Conversational Telephone Speech Training", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2020T05", + "License": "LDC User Agreement for Non-Members", + "Year": 2020, + "Language": "multilingual", + "Dialect": "ar-EG: (Arabic (Egypt))", + "Domain": "transcribed audio", + "Form": "text", + "Collection Style": "other", + "Description": "The source data in this release consists of transcripts of Egyptian Arabic conversational telephone speech (CTS) from LDC's CALLHOME and CALLFRIEND collections (LDC97S45, LDC97T19, LDC2002S37, LDC2002T38, LDC96S49) that were translated into English by professional translation agencies and annotated for the word alignment task.", + "Volume": "20,010", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "1,750.00 $", + "Test Split": "No", + "Tasks": "information retrieval,machine translation", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/bolt_egyptian_arabic-english_word_alignment_--_sms_chat_training.json b/datasets/bolt_egyptian_arabic-english_word_alignment_--_sms_chat_training.json new file mode 100644 index 0000000..b86fb7e --- /dev/null +++ b/datasets/bolt_egyptian_arabic-english_word_alignment_--_sms_chat_training.json @@ -0,0 +1,36 @@ +{ + "Name": "BOLT Egyptian Arabic-English Word Alignment -- SMS/Chat Training", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2019T18", + "License": "LDC User Agreement for Non-Members", + "Year": 2019, + "Language": "multilingual", + "Dialect": "ar-EG: (Arabic (Egypt))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "This release consists of Egyptian Arabic source text message and chat conversations collected using two methods: new collection via LDC's collection platform, and donation of SMS or chat archives from BOLT collection participants. 
The source data is released as BOLT Egyptian Arabic SMS/Chat and Transliteration (LDC2017T07).", + "Volume": "475,665", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "1,750.00 $", + "Test Split": "No", + "Tasks": "information retrieval,machine translation", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/bolt_egyptian_arabic_co-reference_--_discussion_forum,_sms_chat,_and_conversational_telephone_speech.json b/datasets/bolt_egyptian_arabic_co-reference_--_discussion_forum,_sms_chat,_and_conversational_telephone_speech.json new file mode 100644 index 0000000..cfe68f9 --- /dev/null +++ b/datasets/bolt_egyptian_arabic_co-reference_--_discussion_forum,_sms_chat,_and_conversational_telephone_speech.json @@ -0,0 +1,36 @@ +{ + "Name": "BOLT Egyptian Arabic Co-reference -- Discussion Forum, SMS/Chat, and Conversational Telephone Speech", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2021T14", + "License": "LDC User Agreement for Non-Members", + "Year": 2021, + "Language": "ar", + "Dialect": "ar-EG: (Arabic (Egypt))", + "Domain": "commentary", + "Form": "text", + "Collection Style": "other", + "Description": "DF data was collected from the web using a combination of manual and automatic processes. SMS/Chat material was donated or collected via live platforms. CTS data was taken from LDC's Egyptian Arabic CALLHOME and CALLFRIEND telephone collections.", + "Volume": "nan", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "1,250.00 $", + "Test Split": "No", + "Tasks": "coreference resolution", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/bolt_egyptian_arabic_propbank_and_sense_--_discussion_forum,_sms_chat,_and_conversational_telephone_speech.json b/datasets/bolt_egyptian_arabic_propbank_and_sense_--_discussion_forum,_sms_chat,_and_conversational_telephone_speech.json new file mode 100644 index 0000000..959266d --- /dev/null +++ b/datasets/bolt_egyptian_arabic_propbank_and_sense_--_discussion_forum,_sms_chat,_and_conversational_telephone_speech.json @@ -0,0 +1,36 @@ +{ + "Name": "BOLT Egyptian Arabic PropBank and Sense -- Discussion Forum, SMS/Chat, and Conversational Telephone Speech", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2021T18", + "License": "LDC User Agreement for Non-Members", + "Year": 2021, + "Language": "ar", + "Dialect": "ar-EG: (Arabic (Egypt))", + "Domain": "commentary", + "Form": "text", + "Collection Style": "other", + "Description": "DF data was collected from the web using a manual process. SMS/Chat material was donated or collected via live platforms. 
CTS data was taken from LDC's Egyptian Arabic CALLHOME and CALLFRIEND telephone collections.", + "Volume": "nan", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "300.00 $", + "Test Split": "No", + "Tasks": "entity extraction,part of speech tagging,question-answering,semantic role labelling", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/bolt_egyptian_arabic_sms_chat_parallel_training_data.json b/datasets/bolt_egyptian_arabic_sms_chat_parallel_training_data.json new file mode 100644 index 0000000..9dc4c38 --- /dev/null +++ b/datasets/bolt_egyptian_arabic_sms_chat_parallel_training_data.json @@ -0,0 +1,36 @@ +{ + "Name": "BOLT Egyptian Arabic SMS/Chat Parallel Training Data", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2021T15", + "License": "LDC User Agreement for Non-Members", + "Year": 2021, + "Language": "multilingual", + "Dialect": "ar-EG: (Arabic (Egypt))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "The source data in this release was collected using two methods: new collection via LDC's collection platform, and donation of SMS or chat archives from BOLT collection participants. All data were reviewed manually to exclude any messages/conversations that were not in the target language or that had sensitive content, such as personal identifying information.", + "Volume": "723,000", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "1,750.00 $", + "Test Split": "No", + "Tasks": "machine translation", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/bolt_egyptian_arabic_treebank_-_conversational_telephone_speech.json b/datasets/bolt_egyptian_arabic_treebank_-_conversational_telephone_speech.json new file mode 100644 index 0000000..356d9dc --- /dev/null +++ b/datasets/bolt_egyptian_arabic_treebank_-_conversational_telephone_speech.json @@ -0,0 +1,36 @@ +{ + "Name": "BOLT Egyptian Arabic Treebank - Conversational Telephone Speech", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2021T12", + "License": "LDC User Agreement for Non-Members", + "Year": 2021, + "Language": "ar", + "Dialect": "ar-EG: (Arabic (Egypt))", + "Domain": "transcribed audio", + "Form": "text", + "Collection Style": "other", + "Description": "This release contains 153,171 tokens before clitics were split and 182,965 tree tokens after clitics were split for treebank annotation. 
The source data was selected from conversational telephone speech collected by LDC for the CALLHOME project that was transcribed and segmented into sentence units.", + "Volume": "153,171", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "1,750.00 $", + "Test Split": "No", + "Tasks": "information retrieval,information detection,machine translation", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/bolt_egyptian_arabic_treebank_-_discussion_forum.json b/datasets/bolt_egyptian_arabic_treebank_-_discussion_forum.json new file mode 100644 index 0000000..734e615 --- /dev/null +++ b/datasets/bolt_egyptian_arabic_treebank_-_discussion_forum.json @@ -0,0 +1,36 @@ +{ + "Name": "BOLT Egyptian Arabic Treebank - Discussion Forum", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2018T23", + "License": "LDC User Agreement for Non-Members", + "Year": 2018, + "Language": "ar", + "Dialect": "ar-EG: (Arabic (Egypt))", + "Domain": "commentary", + "Form": "text", + "Collection Style": "other", + "Description": "This release contains 440,448 tokens before clitics were split and 508,548 tree tokens after clitics were split for treebank annotation. The source material is web discussion forums collected by LDC from various sources.", + "Volume": "440,448", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "4,500.00 $", + "Test Split": "No", + "Tasks": "information detection,machine translation,information retrieval", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/bolt_egyptian_arabic_treebank_-_sms_chat.json b/datasets/bolt_egyptian_arabic_treebank_-_sms_chat.json new file mode 100644 index 0000000..cf1a45f --- /dev/null +++ b/datasets/bolt_egyptian_arabic_treebank_-_sms_chat.json @@ -0,0 +1,36 @@ +{ + "Name": "BOLT Egyptian Arabic Treebank - SMS/Chat", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2021T17", + "License": "LDC User Agreement for Non-Members", + "Year": 2021, + "Language": "ar", + "Dialect": "ar-EG: (Arabic (Egypt))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "This release contains 349,414 tokens before clitics were split and 435,677 tree tokens after clitics were split for treebank annotation. The source data was collected by LDC from its collection platform or by donation and was manually reviewed to exclude material not in the target language or with sensitive content. 
Originally written in Arabizi (or Romanized/Latin characters) script, the source data was transliterated to Arabic script and manually corrected prior to treebank annotation.", + "Volume": "349,414", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "3,500.00 $", + "Test Split": "No", + "Tasks": "information retrieval,information detection,machine translation", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/bolt_information_retrieval_comprehensive_training_and_evaluation.json b/datasets/bolt_information_retrieval_comprehensive_training_and_evaluation.json new file mode 100644 index 0000000..5e74481 --- /dev/null +++ b/datasets/bolt_information_retrieval_comprehensive_training_and_evaluation.json @@ -0,0 +1,36 @@ +{ + "Name": "BOLT Information Retrieval Comprehensive Training and Evaluation", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2018T18", + "License": "LDC User Agreement for Non-Members", + "Year": 2018, + "Language": "multilingual", + "Dialect": "ar-EG: (Arabic (Egypt))", + "Domain": "commentary", + "Form": "text", + "Collection Style": "other", + "Description": "BOLT Information Retrieval Comprehensive Training and Evaluation contains the pilot, dry run, and evaluation data developed for each phase of the BOLT IR task, including: (1) natural-language IR queries, system responses to queries, and manually-generated assessment judgments for system responses; (2) discussion forum source documents in Arabic, Chinese and English; (3) scoring software for each evaluation phase; and (4) experimental data developed in Phase 2.", + "Volume": "nan", + "Unit": "nan", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "2,500.00 $", + "Test Split": "No", + "Tasks": "information retrieval,machine translation", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/botta.json b/datasets/botta.json new file mode 100644 index 0000000..4c5783c --- /dev/null +++ b/datasets/botta.json @@ -0,0 +1,36 @@ +{ + "Name": "Botta", + "Subsets": [], + "HF Link": "nan", + "Link": "https://camel.abudhabi.nyu.edu/botta/", + "License": "custom", + "Year": 2016, + "Language": "ar", + "Dialect": "ar-EG: (Arabic (Egypt))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "Database files for Botta, an Egyptian Arabic dialect chatbot", + "Volume": "nan", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "NYU Abu Dhabi", + "Derived From": "nan", + "Paper Title": "Botta: An Arabic Dialect Chatbot", + "Paper Link": "https://aclanthology.org/C16-2044.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "CAMeL Resources", + "Access": "Upon-Request", + "Cost": "nan", + "Test Split": "No", + "Tasks": "dialogue generation", + "Venue Title": "COLING", + "Citations": "14.0", + "Venue Type": "conference", + "Venue Name": "International Conference on
Computational Linguistics", + "Authors": "Dana Abu Ali,Nizar Habash", + "Affiliations": ",", + "Abstract": "This paper presents BOTTA, the first Arabic dialect chatbot. We explore the challenges of creating a conversational agent that aims to simulate friendly conversations using the Egyptian Arabic dialect. We present a number of solutions and describe the different components of the BOTTA chatbot. The BOTTA database files are publicly available for researchers working on Arabic chatbot technologies. The BOTTA chatbot is also publicly available for any users who want to chat with it online.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/brad_1_0.json b/datasets/brad_1_0.json new file mode 100644 index 0000000..43c81ba --- /dev/null +++ b/datasets/brad_1_0.json @@ -0,0 +1,36 @@ +{ + "Name": "BRAD 1.0", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/BRAD", + "Link": "https://github.com/elnagara/BRAD-Arabic-Dataset", + "License": "unknown", + "Year": 2016, + "Language": "ar", + "Dialect": "mixed", + "Domain": "reviews", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "The reviews were collected from the GoodReads.com website during June/July 2016", + "Volume": "156,506", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "Sharjah University", + "Derived From": "nan", + "Paper Title": "BRAD 1.0: Book reviews in Arabic dataset", + "Paper Link": "https://ieeexplore.ieee.org/abstract/document/7945800", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "review classification", + "Venue Title": "AICCSA", + "Citations": "32.0", + "Venue Type": "conference", + "Venue Name": "International Conference on Computer Systems and Applications", + "Authors": "Ashraf Elnagar,Omar Einea", + "Affiliations": ",", + "Abstract": "The availability of rich datasets is a pre-requisite for proposing robust sentiment analysis systems. A variety of such datasets exists in English language. However, it is rare or nonexistent for the Arabic language except for a recent LABR dataset, which consists of a little bit over 63,000 book reviews extracted from Goodreads.com. We introduce BRAD 1.0, the largest Book Reviews in Arabic Dataset for sentiment analysis and machine language applications. BRAD comprises of almost 510,600 book records. Each record corresponds for a single review and has the review in Arabic language and the reviewer's rating on a scale of 1 to 5 stars. In this paper, we present and describe the properties of BRAD. Further, we provide two versions of BRAD: the complete unbalanced dataset and the balanced version of BRAD. Finally, we implement four sentiment analysis classifiers based on this dataset and report our findings. When training and testing the classifiers on BRAD as opposed to LABR, an improvement rate growth of 46% is reported. The highest accuracy attained is 91%.
Our core contribution is to make this benchmark-dataset available and accessible to the research community on Arabic language.", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/brad_2_0.json b/datasets/brad_2_0.json new file mode 100644 index 0000000..24f1de6 --- /dev/null +++ b/datasets/brad_2_0.json @@ -0,0 +1,36 @@ +{ + "Name": "BRAD 2.0", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/BRAD", + "Link": "https://github.com/elnagara/BRAD-Arabic-Dataset", + "License": "unknown", + "Year": 2018, + "Language": "ar", + "Dialect": "mixed", + "Domain": "reviews", + "Form": "text", + "Collection Style": "crawling", + "Description": "Extension to BRAD 1.0 with more than 200K extra records to account for several Arabic dialects.", + "Volume": "692,586", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "University of Sharjah", + "Derived From": "BRAD 1.0", + "Paper Title": "An Annotated Huge Dataset for Standard and Colloquial Arabic Reviews for Subjective Sentiment Analysis", + "Paper Link": "https://www.sciencedirect.com/science/article/pii/S1877050918321781", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "review classification", + "Venue Title": "ACLING", + "Citations": "28.0", + "Venue Type": "conference", + "Venue Name": "International Conference on AI in Computational Linguistics", + "Authors": "Ashraf Elnagar,Leena Lulu,Omar Einea", + "Affiliations": ",,", + "Abstract": "Sentiment analysis is getting increasingly popular as it facilitates gaining an indication of the wider public opinions or attitudes towards certain products, services, articles, etc. Many researchers have shown considerable interest in this field. Most of these studies have focused on English and other Indo-European languages. Very few studies have addressed the problem for the Arabic language. This is, mostly, due to the rare or nonexistent huge and free Arabic datasets that contains both Modern Standard Arabic (MSA) as well as Dialectal Arabic (DA). Generally, one of the main challenges for developing robust sentiment analysis systems is the availability of such large-scale datasets. Such datasets exist in abundance for English language, while it is not the case for a low-resource language such as the Arabic language. Recently, there have been some efforts for providing relatively large-scale Arabic datasets dedicated for sentiment analysis such as LABR and most recently BRAD 1.0, which is considered as the largest Arabic Book Reviews dataset for sentiment analysis and machine learning applications. In this work, we present BRAD 2.0, an extension to BRAD 1.0 with more than 200K extra records to account for several Arabic dialects. BRAD 2.0 has a total number of 692586 annotated reviews; each represents a single review along with the reviewer\u2019s rating ranging from 1 to 5 of a certain book. The most interesting property of BRAD 2.0 is that it combines both MSA and DA. To verify and validate the proposed dataset, we implement several state-of-the-art supervised and unsupervised classifiers to categorize book reviews. For the unsupervised classifiers, we implemented several models of CNN and RNN classifiers utilizing GloVe-based word embeddings. Although all classifiers performed well, the highest accuracies attained are between 90% and 91%. Experimental results show that BRAD 2.0 is rich and robust.
Our key contribution is to make this benchmark-dataset available and accessible to promote further research in the field of Arabic computational linguistic.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/buckwalter_arabic_morphological_analyzer_version_1_0.json b/datasets/buckwalter_arabic_morphological_analyzer_version_1_0.json new file mode 100644 index 0000000..0cbbf10 --- /dev/null +++ b/datasets/buckwalter_arabic_morphological_analyzer_version_1_0.json @@ -0,0 +1,36 @@ +{ + "Name": "Buckwalter Arabic Morphological Analyzer Version 1.0", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2002L49", + "License": "LDC User Agreement for Non-Members", + "Year": 2002, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "The data consists primarily of three Arabic-English lexicon files: prefixes (299 entries), suffixes (618 entries), and stems (82,158 entries representing 38,600 lemmas). The lexicons are supplemented by three morphological compatibility tables used for controlling prefix-stem combinations (1,648 entries), stem-suffix combinations (1,285 entries), and prefix-suffix combinations (598 entries). The actual code for morphology analysis and POS tagging is contained in a Perl script. The documentation consists of a readme file with a description of the lexicon files, the morphological compatibility tables, the morphology analysis algorithm, a summary of stem morphological categories, and a table with the author's Arabic transliteration system.", + "Volume": "299", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "Upon-Request", + "Cost": "nan", + "Test Split": "No", + "Tasks": "information retrieval,machine translation", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/buckwalter_arabic_morphological_analyzer_version_2_0.json b/datasets/buckwalter_arabic_morphological_analyzer_version_2_0.json new file mode 100644 index 0000000..953da52 --- /dev/null +++ b/datasets/buckwalter_arabic_morphological_analyzer_version_2_0.json @@ -0,0 +1,36 @@ +{ + "Name": "Buckwalter Arabic Morphological Analyzer Version 2.0", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2004L02", + "License": "LDC User Agreement for Non-Members", + "Year": 2004, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "The data consists primarily of three Arabic-English lexicon files: prefixes (299 entries), suffixes (618 entries), and stems (82158 entries representing 38600 lemmas). The lexicons are supplemented by three morphological compatibility tables used for controlling prefix-stem combinations (1648 entries), stem-suffix combinations (1285 entries), and prefix-suffix combinations (598 entries). The actual code for morphology analysis and POS tagging is contained in a Perl script.
The documentation consists of a readme file with a description of the lexicon files, the morphological compatibility tables, the morphology analysis algorithm, a summary of stem morphological categories, and a table with the author's Arabic transliteration system.", + "Volume": "299", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "N/A $", + "Test Split": "No", + "Tasks": "machine translation,information retrieval", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/calima-glf.json b/datasets/calima-glf.json new file mode 100644 index 0000000..3ac09f9 --- /dev/null +++ b/datasets/calima-glf.json @@ -0,0 +1,36 @@ +{ + "Name": "CALIMA-GLF", + "Subsets": [], + "HF Link": "nan", + "Link": "https://github.com/unimorph/afb", + "License": "CC BY-SA 3.0", + "Year": 2017, + "Language": "ar", + "Dialect": "ar-GLF: (Arabic (Gulf))", + "Domain": "social media", + "Form": "text", + "Collection Style": "manual curation", + "Description": "The dataset is part of the CALIMAGLF morphological analyzer, focusing on Gulf Arabic verbs. It includes over 2,600 verbal lemmas with associated paradigms and lexical entries. The verbs were derived from phonetic dictionary entries, annotated, and expanded into inflected forms, including orthographic variants.", + "Volume": "2,648", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "NYU Abu Dhabi", + "Derived From": "nan", + "Paper Title": "A Morphological Analyzer for Gulf Arabic Verbs", + "Paper Link": "https://aclanthology.org/W17-1305.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "CAMeL Resources", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "part of speech tagging, morphological analysis", + "Venue Title": "WANLP", + "Citations": "nan", + "Venue Type": "workshop", + "Venue Name": "The Third Arabic Natural Language Processing Workshop", + "Authors": "Salam Khalifa, Sara Hassan, Nizar Habash", + "Affiliations": "CAMeL Lab, New York University Abu Dhabi", + "Abstract": "We present CALIMAGLF, a Gulf Arabic morphological analyzer currently covering over 2,600 verbal lemmas. We describe in detail the process of building the analyzer starting from phonetic dictionary entries to fully inflected orthographic paradigms and associated lexicon and orthographic variants. We evaluate the coverage of CALIMAGLF against Modern Standard Arabic and Egyptian Arabic analyzers on part of a Gulf Arabic novel.
CALIMAGLF verb analysis token recall for identifying correct POS tag outperforms both the Modern Standard Arabic and Egyptian Arabic analyzers by over 27.4% and 16.9% absolute, respectively.", + "Added By": "Maryam Al Emadi" +} \ No newline at end of file diff --git a/datasets/callfriend_egyptian_arabic.json b/datasets/callfriend_egyptian_arabic.json new file mode 100644 index 0000000..1c7f882 --- /dev/null +++ b/datasets/callfriend_egyptian_arabic.json @@ -0,0 +1,36 @@ +{ + "Name": "CALLFRIEND Egyptian Arabic", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC96S49", + "License": "LDC User Agreement for Non-Members", + "Year": 2002, + "Language": "ar", + "Dialect": "ar-EG: (Arabic (Egypt))", + "Domain": "transcribed audio", + "Form": "spoken", + "Collection Style": "other", + "Description": "The corpus consists of 60 unscripted telephone conversations, lasting between 5 and 30 minutes. The corpus also includes documentation describing speaker information (sex, age, education, callee telephone number) and call information (channel quality, number of speakers).", + "Volume": "25", + "Unit": "hours", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "1,000.00 $", + "Test Split": "No", + "Tasks": "language identification", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/callfriend_egyptian_arabic_second_edition.json b/datasets/callfriend_egyptian_arabic_second_edition.json new file mode 100644 index 0000000..38468b6 --- /dev/null +++ b/datasets/callfriend_egyptian_arabic_second_edition.json @@ -0,0 +1,36 @@ +{ + "Name": "CALLFRIEND Egyptian Arabic Second Edition", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2019S04", + "License": "LDC User Agreement for Non-Members", + "Year": 2019, + "Language": "ar", + "Dialect": "ar-EG: (Arabic (Egypt))", + "Domain": "transcribed audio", + "Form": "spoken", + "Collection Style": "other", + "Description": "All data was collected before July 1997. Participants could speak with a person of their choice on any topic; most called family members and friends. All calls originated in North America. 
The recorded conversations last up to 30 minutes.", + "Volume": "0.5", + "Unit": "hours", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "1,000.00 $", + "Test Split": "No", + "Tasks": "language identification", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/callhome__egyptian_arabic_speech_translation_corpus.json b/datasets/callhome__egyptian_arabic_speech_translation_corpus.json new file mode 100644 index 0000000..1ff8e9a --- /dev/null +++ b/datasets/callhome__egyptian_arabic_speech_translation_corpus.json @@ -0,0 +1,36 @@ +{ + "Name": "CALLHOME: Egyptian Arabic Speech Translation Corpus", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/CALLHOME", + "Link": "https://github.com/noisychannel/ARZ_callhome_corpus", + "License": "CC BY-SA 4.0", + "Year": 2014, + "Language": "multilingual", + "Dialect": "ar-EG: (Arabic (Egypt))", + "Domain": "social media", + "Form": "text", + "Collection Style": "human translation", + "Description": "three-way parallel dataset of Egyptian Arabic Speech, transcriptions and English translations", + "Volume": "39,213", + "Unit": "sentences", + "Ethical Risks": "Medium", + "Provider": "Multiple Institutions", + "Derived From": "nan", + "Paper Title": "Translations of the CALLHOME Egyptian Arabic Corpus for Conversational Speech Translation", + "Paper Link": "https://www.cis.upenn.edu/~ccb/publications/callhome-egyptian-arabic-speech-translations.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "machine translation", + "Venue Title": "other", + "Citations": "10.0", + "Venue Type": "preprint", + "Venue Name": "nan", + "Authors": "G. Kumar,Yuan Cao,Ryan Cotterell,Chris Callison-Burch,Daniel Povey,S. Khudanpur", + "Affiliations": ",Google Brain,,,,", + "Abstract": "Translation of the output of automatic speech recognition (ASR) systems, also known as speech translation, has received a lot of research interest recently. This is especially true for programs such as DARPA BOLT which focus on improving spontaneous human-human conversation across languages. However, this research is hindered by the dearth of datasets developed for this explicit purpose. For Egyptian Arabic-English, in particular, no parallel speech-transcription-translation dataset exists in the same domain. In order to support research in speech translation, we introduce the Callhome Egyptian Arabic-English Speech Translation Corpus. This supplements the existing LDC corpus with four reference translations for each utterance in the transcripts. 
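The corpus above pairs each utterance with four reference translations, which fits multi-reference BLEU scoring directly; a hedged sketch with sacrebleu, where the file names are hypothetical placeholders for the repository's actual layout:

```python
# Sketch: scoring an MT system against the corpus's four reference
# translations per utterance with sacrebleu (pip install sacrebleu).
# File names are hypothetical; the real repository layout may differ.
import sacrebleu

def read_lines(path):
    with open(path, encoding="utf-8") as f:
        return [line.strip() for line in f]

hypotheses = read_lines("system_output.en")                 # one line per utterance
references = [read_lines(f"ref{i}.en") for i in range(4)]   # four reference streams

bleu = sacrebleu.corpus_bleu(hypotheses, references)
print(f"BLEU = {bleu.score:.2f}")
```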
The result is a three-way parallel dataset of Egyptian Arabic Speech, transcriptions and English translations.", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/callhome_egyptian_arabic_speech.json b/datasets/callhome_egyptian_arabic_speech.json new file mode 100644 index 0000000..49e5921 --- /dev/null +++ b/datasets/callhome_egyptian_arabic_speech.json @@ -0,0 +1,36 @@ +{ + "Name": "CALLHOME Egyptian Arabic Speech", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC97S45", + "License": "LDC User Agreement for Non-Members", + "Year": 2002, + "Language": "ar", + "Dialect": "ar-EG: (Arabic (Egypt))", + "Domain": "transcribed audio", + "Form": "spoken", + "Collection Style": "other", + "Description": "All calls, which lasted up to 30 minutes, originated in North America and were placed to locations overseas (typically Egypt). Most participants called family members or close friends.", + "Volume": "120", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "1,500.00 $", + "Test Split": "No", + "Tasks": "speech recognition", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/callhome_egyptian_arabic_speech_supplement.json b/datasets/callhome_egyptian_arabic_speech_supplement.json new file mode 100644 index 0000000..0632b8d --- /dev/null +++ b/datasets/callhome_egyptian_arabic_speech_supplement.json @@ -0,0 +1,36 @@ +{ + "Name": "CALLHOME Egyptian Arabic Speech Supplement", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2002S37", + "License": "LDC User Agreement for Non-Members", + "Year": 2002, + "Language": "ar", + "Dialect": "ar-EG: (Arabic (Egypt))", + "Domain": "transcribed audio", + "Form": "spoken", + "Collection Style": "other", + "Description": "There are 20 data files in sphere format. The files are 8 KHz shorten-compressed two-channel mulaw. 12 of the files were recorded from domestic phone calls (both parties living in the continental U.S.), while the other eight are overseas calls (a participant in the U.S. 
called a friend or relative in Egypt or some other overseas country).", + "Volume": "20", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "1,000.00 $", + "Test Split": "No", + "Tasks": "speech recognition", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/callhome_egyptian_arabic_transcripts.json b/datasets/callhome_egyptian_arabic_transcripts.json new file mode 100644 index 0000000..ade433c --- /dev/null +++ b/datasets/callhome_egyptian_arabic_transcripts.json @@ -0,0 +1,36 @@ +{ + "Name": "CALLHOME Egyptian Arabic Transcripts", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC97T19", + "License": "LDC User Agreement for Non-Members", + "Year": 2002, + "Language": "ar", + "Dialect": "ar-EG: (Arabic (Egypt))", + "Domain": "transcribed audio", + "Form": "text", + "Collection Style": "other", + "Description": "The transcripts are timestamped by speaker turn for alignment with the speech signal and are provided in standard orthography.", + "Volume": "120", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "1,000.00 $", + "Test Split": "No", + "Tasks": "speech recognition", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/callhome_egyptian_arabic_transcripts_supplement.json b/datasets/callhome_egyptian_arabic_transcripts_supplement.json new file mode 100644 index 0000000..854c45a --- /dev/null +++ b/datasets/callhome_egyptian_arabic_transcripts_supplement.json @@ -0,0 +1,36 @@ +{ + "Name": "CALLHOME Egyptian Arabic Transcripts Supplement", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2002T38", + "License": "LDC User Agreement for Non-Members", + "Year": 2002, + "Language": "ar", + "Dialect": "ar-EG: (Arabic (Egypt))", + "Domain": "transcribed audio", + "Form": "text", + "Collection Style": "other", + "Description": "There are 40 data files. 
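The CALLHOME/CALLFRIEND audio releases above are distributed as NIST SPHERE files (8 kHz, two-channel mulaw, shorten-compressed). A sketch of reading the plain-text SPHERE header follows; decoding the compressed samples themselves requires LDC's sph2pipe tool, and the file name here is hypothetical:

```python
# Sketch: reading the plain-text header of a NIST SPHERE file, the audio
# container used by the CALLHOME/CALLFRIEND releases above. The
# shorten-compressed samples themselves need LDC's sph2pipe tool.
def read_sphere_header(path):
    fields = {}
    with open(path, "rb") as f:
        assert f.read(8).startswith(b"NIST_1A"), "not a SPHERE file"
        header_size = int(f.read(8).strip())      # usually 1024 bytes
        f.seek(0)
        header = f.read(header_size).decode("ascii", errors="replace")
    for line in header.splitlines()[2:]:
        if line.strip() == "end_head":
            break
        name, _type, value = line.split(None, 2)  # e.g. "sample_rate -i 8000"
        fields[name] = int(value) if _type == "-i" else value
    return fields

hdr = read_sphere_header("arabic_call_01.sph")    # hypothetical file name
print(hdr.get("sample_rate"), hdr.get("channel_count"))
```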
Each of the 20 calls has transcripts in two formats: .txt and .scr.", + "Volume": "40", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "750.00 $", + "Test Split": "No", + "Tasks": "speech recognition", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/calliar.json b/datasets/calliar.json new file mode 100644 index 0000000..ccfa6fb --- /dev/null +++ b/datasets/calliar.json @@ -0,0 +1,36 @@ +{ + "Name": "Calliar", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/Calliar", + "Link": "https://github.com/ARBML/Calliar", + "License": "MIT License", + "Year": 2021, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "images", + "Collection Style": "crawling and annotation(other)", + "Description": "Calliar is a dataset for Arabic calligraphy. It consists of 2,500 JSON files containing manually annotated strokes for Arabic calligraphy. The repository hosts the dataset for the paper listed below.", + "Volume": "2,500", + "Unit": "images", + "Ethical Risks": "Low", + "Provider": "ARBML", + "Derived From": "nan", + "Paper Title": "Calliar: An Online Handwritten Dataset for Arabic Calligraphy", + "Paper Link": "https://arxiv.org/pdf/2106.10745", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "calligraphy", + "Venue Title": "NCAA", + "Citations": "nan", + "Venue Type": "journal", + "Venue Name": "Neural Computing and Applications", + "Authors": "Zaid Alyafeai, Maged S. Al-shaibani, Mustafa Ghaleb & Yousif Ahmed Al-Wajih", + "Affiliations": "nan", + "Abstract": "Calligraphy is an essential part of the Arabic heritage and culture. It has been used in the past for the decoration of houses and mosques. Usually, such calligraphy is designed manually by experts with aesthetic insights. In the past few years, there has been a considerable effort to digitize such type of art by either taking a photograph of decorated buildings or drawing them using digital devices. The latter is considered an online form where the drawing is tracked by recording the apparatus movement, an electronic pen, for instance, on a screen. In the literature, there are many offline datasets with diverse Arabic styles for calligraphy. However, there is no available online dataset for Arabic calligraphy. In this paper, we illustrate our approach for collecting and annotating an online dataset for Arabic calligraphy called Calliar, which consists of 2,500 sentences. Calliar is annotated for stroke, character, word, and sentence-level prediction. We also propose various baseline models for the character classification task. 
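Calliar stores stroke annotations as JSON; a sketch of rendering one file follows, assuming (this is an assumption, not the documented schema) each file holds a list of strokes, each a list of (x, y) points:

```python
# Sketch: rendering one Calliar annotation. The exact JSON layout is an
# assumption here (a list of strokes, each a list of (x, y) points);
# check the repository for the real schema before relying on this.
import json
import matplotlib.pyplot as plt

with open("sample_calligraphy.json", encoding="utf-8") as f:  # hypothetical file
    strokes = json.load(f)

for stroke in strokes:                 # assumed: [[(x, y), ...], ...]
    xs, ys = zip(*stroke)
    plt.plot(xs, ys, color="black")
plt.gca().invert_yaxis()               # image coordinates grow downwards
plt.axis("off")
plt.show()
```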
The results we achieved highlight that it is still an open problem.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/calyou.json b/datasets/calyou.json new file mode 100644 index 0000000..fc0c555 --- /dev/null +++ b/datasets/calyou.json @@ -0,0 +1,36 @@ +{ + "Name": "CALYOU", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/CAYLOU", + "Link": "https://github.com/abidikarima/CALYOU", + "License": "unknown", + "Year": 2017, + "Language": "ar", + "Dialect": "ar-DZ: (Arabic (Algeria))", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling", + "Description": "A Comparable Spoken Algerian Corpus Harvested from YouTube", + "Volume": "5,190", + "Unit": "sentences", + "Ethical Risks": "Medium", + "Provider": "Multiple Institutions", + "Derived From": "nan", + "Paper Title": "CALYOU: A Comparable Spoken Algerian Corpus Harvested from YouTube", + "Paper Link": "https://hal.archives-ouvertes.fr/hal-01531591/document", + "Script": "Arab-Latn", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "machine translation", + "Venue Title": "INTERSPEECH", + "Citations": "16.0", + "Venue Type": "conference", + "Venue Name": "Conference of the International Speech Communication Association", + "Authors": "K. Abidi,M. Menacer,Kamel Sma\u00efli", + "Affiliations": ",,", + "Abstract": "This paper addresses the issue of comparability of comments extracted from YouTube. The comments concern spoken Algerian, which could be either local Arabic, Modern Standard Arabic or French. This diversity of expression raises a huge number of problems concerning the data processing. In this article, several methods of alignment are proposed and tested. The method which permits the best alignment is a Word2Vec-based approach that is used iteratively. This recurrent call of Word2Vec significantly improves the comparability results. In fact, a dictionary-based approach leads to a Recall of 4, while our approach achieves a Recall of 33 at rank 1. Thanks to this approach, we built CALYOU, a Comparable Corpus of spoken Algerian, from YouTube.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/cameltb__camel_treebank_1_0.json b/datasets/cameltb__camel_treebank_1_0.json new file mode 100644 index 0000000..7a99560 --- /dev/null +++ b/datasets/cameltb__camel_treebank_1_0.json @@ -0,0 +1,36 @@ +{ + "Name": "CamelTB: Camel Treebank 1.0", + "Subsets": [], + "HF Link": "nan", + "Link": "http://treebank.camel-lab.com/", + "License": "custom", + "Year": 2022, + "Language": "ar", + "Dialect": "ar-CLS: (Arabic (Classic))", + "Domain": "books", + "Form": "text", + "Collection Style": "manual curation", + "Description": "The Camel Treebank (CAMELTB) is a 188K word open-source dependency treebank of Modern Standard and Classical Arabic. 
It includes 13 sub-corpora comprising selections of texts from pre-Islamic poetry to social media online commentaries, and covering a range of genres from religious and philosophical texts to news, novels, and student essays.", + "Volume": "242,000", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "NYU Abu Dhabi", + "Derived From": "nan", + "Paper Title": "Camel Treebank: An Open Multi-genre Arabic Dependency Treebank", + "Paper Link": "http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.286.pdf", + "Script": "Arab", + "Tokenized": "Yes", + "Host": "CAMeL Resources", + "Access": "Upon-Request", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "part of speech tagging, morphological analysis, dependency parsing", + "Venue Title": "LREC", + "Citations": "nan", + "Venue Type": "conference", + "Venue Name": "Language Resources and Evaluation Conference", + "Authors": "Nizar Habash, Muhammed AbuOdeh, Dima Taji, Reem Faraj, Jamila El Gizuli, Omar Kallas", + "Affiliations": "New York University Abu Dhabi", + "Abstract": "We present the Camel Treebank (CAMELTB), a 188K word open-source dependency treebank of Modern Standard and Classical\nArabic. CAMELTB 1.0 includes 13 sub-corpora comprising selections of texts from pre-Islamic poetry to social media online\ncommentaries, and covering a range of genres from religious and philosophical texts to news, novels, and student essays. The\ntexts are all publicly available (out of copyright, creative commons, or under open licenses). The texts were morphologically\ntokenized and syntactically parsed automatically, and then manually corrected by a team of trained annotators. The annotations\nfollow the guidelines of the Columbia Arabic Treebank (CATiB) dependency representation. We discuss our annotation process\nand guideline extensions, and we present some initial observations on lexical and syntactic differences among the annotated\nsub-corpora. 
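CamelTB follows the CATiB dependency representation, and treebanks in that tradition are commonly distributed in CoNLL-style columns, so a reader along these lines may apply (the exact column layout and file name are assumptions):

```python
# Sketch: reading a dependency treebank in a CoNLL-X-like format. Whether
# CamelTB ships exactly this way is an assumption; CATiB trees are commonly
# released in CoNLL columns (ID, FORM, ..., HEAD, DEPREL).
def read_conll(path):
    sentences, current = [], []
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.rstrip("\n")
            if not line:                 # blank line ends a sentence
                if current:
                    sentences.append(current)
                    current = []
                continue
            cols = line.split("\t")
            current.append({"id": int(cols[0]), "form": cols[1],
                            "head": int(cols[6]), "deprel": cols[7]})
    if current:
        sentences.append(current)
    return sentences

for tok in read_conll("cameltb_sample.conll")[0]:   # hypothetical file name
    print(tok["form"], "<-", tok["deprel"], "-", tok["head"])
```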
This corpus will be publicly available to support and encourage research on Arabic NLP in general and on new,\npreviously unexplored genres that are of interest to a wider spectrum of researchers, from historical linguistics and digital\nhumanities to computer-assisted language pedagogy.", + "Added By": "Nizar Habash" +} \ No newline at end of file diff --git a/datasets/canercorpus.json b/datasets/canercorpus.json new file mode 100644 index 0000000..8e6735d --- /dev/null +++ b/datasets/canercorpus.json @@ -0,0 +1,36 @@ +{ + "Name": "CANERCorpus", + "Subsets": [], + "HF Link": "https://hf.co/datasets/caner", + "Link": "https://github.com/RamziSalah/Classical-Arabic-Named-Entity-Recognition-Corpus", + "License": "unknown", + "Year": 2018, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "manual curation", + "Description": "It is freely available and manually annotated by human experts, containing more than 7,000 Hadiths", + "Volume": "72,108", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "Universiti Kebangsaan", + "Derived From": "nan", + "Paper Title": "Building the Classical Arabic Named Entity Recognition Corpus (CANERCorpus)", + "Paper Link": "https://ieeexplore.ieee.org/document/8464820/authors#authors", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "named entity recognition", + "Venue Title": "CAMP", + "Citations": "0.0", + "Venue Type": "conference", + "Venue Name": "International Conference on Information Retrieval and Knowledge Management", + "Authors": "Ramzi Salah,Lailatul Qadri Binti Zakaria", + "Affiliations": ",", + "Abstract": "The past decade has witnessed construction of the background information resources to overcome several challenges in text mining tasks. For non-English languages with poor knowledge sources such as Arabic, these challenges have become more salient especially for handling the natural language processing applications that require human annotation. In the Named Entity Recognition (NER) task, several researches have been introduced to address the complexity of Arabic in terms of morphological and syntactical variations. However, there are a small number of studies dealing with Classical Arabic (CA) that is the official language of Quran and Hadith. CA was also used for archiving the Islamic topics that contain a lot of useful information which could be of great value if extracted. Therefore, in this paper, we introduce Classical Arabic Named Entity Recognition corpus as a new corpus of tagged data that can be useful for handling the issues in recognition of Arabic named entities. It is freely available and manually annotated by human experts, containing more than 7,000 Hadiths. Based on Islamic topics, we classify named entities into 20 types which include the specific-domain entities that have not been handled before such as Allah, Prophet, Paradise, Hell, and Religion. The differences between standard and classical Arabic are described in detail in this work. 
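Given the Hugging Face mirror linked above, the label distribution of CANERCorpus can be inspected in a few lines; the column name is an assumption taken from the hub card, so check dataset.features if it differs:

```python
# Sketch: loading CANERCorpus from the Hugging Face Hub (the "caner"
# dataset linked above) and counting entity classes. The column name
# "ner_tag" is an assumption; inspect ds.features if it differs.
from collections import Counter
from datasets import load_dataset

ds = load_dataset("caner", split="train")
counts = Counter(row["ner_tag"] for row in ds)
print(counts.most_common(10))
```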
Moreover, a comprehensive statistical analysis is introduced to measure the factors that play an important role in manual human annotation.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/cc-100.json b/datasets/cc-100.json new file mode 100644 index 0000000..d18d50d --- /dev/null +++ b/datasets/cc-100.json @@ -0,0 +1,36 @@ +{ + "Name": "CC-100", + "Subsets": [], + "HF Link": "https://hf.co/datasets/statmt/cc100", + "Link": "https://data.statmt.org/cc-100/", + "License": "unknown", + "Year": 2020, + "Language": "ar", + "Dialect": "mixed", + "Domain": "other", + "Form": "text", + "Collection Style": "crawling", + "Description": "monolingual datasets from Common Crawl for a variety of languages", + "Volume": "7,132,000", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "Facebook", + "Derived From": "Common Crawl", + "Paper Title": "CCNet: Extracting High Quality Monolingual Datasets from Web Crawl Data", + "Paper Link": "https://aclanthology.org/2020.lrec-1.494.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "text generation, language modeling", + "Venue Title": "LREC", + "Citations": "nan", + "Venue Type": "conference", + "Venue Name": "Language Resources and Evaluation Conference", + "Authors": "Guillaume Wenzek, Marie-Anne Lachaux, Alexis Conneau, Vishrav Chaudhary,\nFrancisco Guzm\u00e1n, Armand Joulin, Edouard Grave", + "Affiliations": "Facebook AI", + "Abstract": "Pre-training text representations have led to significant improvements in many areas of natural language processing. The\nquality of these models benefits greatly from the size of the pretraining corpora as long as its quality is preserved. In this\npaper, we describe an automatic pipeline to extract massive high-quality monolingual datasets from Common Crawl for\na variety of languages. Our pipeline follows the data processing introduced in fastText (Mikolov et al., 2017; Grave et al.,\n2018), that deduplicates documents and identifies their language. We augment this pipeline with a filtering step to select\ndocuments that are close to high quality corpora like Wikipedia.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/ccaligned.json b/datasets/ccaligned.json new file mode 100644 index 0000000..0109201 --- /dev/null +++ b/datasets/ccaligned.json @@ -0,0 +1,36 @@ +{ + "Name": "CCAligned", + "Subsets": [], + "HF Link": "https://hf.co/datasets/ahelk/ccaligned_multilingual", + "Link": "https://opus.nlpl.eu/CCAligned.php", + "License": "unknown", + "Year": 2020, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "CCAligned consists of parallel or comparable web-document pairs in 137 languages aligned with English. These web-document pairs were constructed by performing language identification on raw web-documents, and ensuring that corresponding language codes appeared in the URLs of the web documents. 
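For corpora of this size, streaming avoids downloading the full dump; a sketch using the Hugging Face loader, where the lang keyword follows the cc100 dataset card (newer datasets versions may additionally require trust_remote_code=True):

```python
# Sketch: streaming the Arabic portion of CC-100 from the Hugging Face Hub
# without downloading the full dump. The `lang` keyword follows the cc100
# dataset card; treat the exact field names as assumptions.
from datasets import load_dataset

cc100_ar = load_dataset("cc100", lang="ar", split="train", streaming=True)
for i, row in enumerate(cc100_ar):
    print(row["text"][:80])
    if i == 2:          # peek at the first three records only
        break
```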
", + "Volume": "1,219,374", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "Multiple Institutions", + "Derived From": "nan", + "Paper Title": "CCAligned: A Massive Collection of Cross-lingual Web-Document Pairs", + "Paper Link": "http://aclanthology.lst.uni-saarland.de/2020.emnlp-main.480.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "machine translation", + "Venue Title": "EMNLP", + "Citations": "nan", + "Venue Type": "conference", + "Venue Name": "Empirical Methods in Natural Language Processing", + "Authors": "Ahmed El-Kishky, Vishrav Chaudhary, Francisco Guzm\u00e1, Philipp Koehn", + "Affiliations": "Facebook AI; Facebook AI; Facebook AI; Johns Hopkins University", + "Abstract": "Cross-lingual document alignment aims to\nidentify pairs of documents in two distinct languages that are of comparable content or translations of each other. In this paper, we exploit the signals embedded in URLs to label\nweb documents at scale with an average precision of 94.5% across different language pairs.\nWe mine sixty-eight snapshots of the Common Crawl corpus and identify web document\npairs that are translations of each other. We\nrelease a new web dataset consisting of over\n392 million URL pairs from Common Crawl\ncovering documents in 8144 language pairs\nof which 137 pairs include English. In addition to curating this massive dataset, we introduce baseline methods that leverage crosslingual representations to identify aligned documents based on their textual content. Finally,\nwe demonstrate the value of this parallel documents dataset through a downstream task of\nmining parallel sentences and measuring the\nquality of machine translations from models\ntrained on this mined data. 
Our objective in\nreleasing this dataset is to foster new research\nin cross-lingual NLP across a variety of low,\nmedium, and high-resource languages.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/ccmatrix.json b/datasets/ccmatrix.json new file mode 100644 index 0000000..c69f467 --- /dev/null +++ b/datasets/ccmatrix.json @@ -0,0 +1,36 @@ +{ + "Name": "CCMatrix", + "Subsets": [], + "HF Link": "https://hf.co/datasets/yhavinga/ccmatrix", + "Link": "https://github.com/facebookresearch/LASER/tree/main/tasks/CCMatrix", + "License": "unknown", + "Year": 2020, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "crawling", + "Description": "Using one unified approach for 80 languages, 10.8 billion parallel sentences were mined, out of which only 2.9 billion are aligned with English", + "Volume": "196,000,000", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "Facebook", + "Derived From": "nan", + "Paper Title": "CCMatrix: Mining Billions of High-Quality Parallel Sentences on the WEB", + "Paper Link": "https://arxiv.org/pdf/1911.04944.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "machine translation", + "Venue Title": "arXiv", + "Citations": "nan", + "Venue Type": "preprint", + "Venue Name": "nan", + "Authors": "Holger Schwenk, Guillaume Wenzek, Sergey Edunov, Edouard Grave, Armand Joulin", + "Affiliations": "Facebook AI", + "Abstract": "We show that margin-based bitext mining in a\nmultilingual sentence space can be applied to\nmonolingual corpora of billions of sentences.\nWe are using ten snapshots of a curated common crawl corpus (Wenzek et al., 2019), totalling 32.7 billion unique sentences. Using\none unified approach for 38 languages, we\nwere able to mine 4.5 billion parallel sentences, out of which 661 million are aligned\nwith English. 20 language pairs have more\nthan 30 million parallel sentences, 112 more\nthan 10 million, and most more than one\nmillion, including direct alignments between\nmany European or Asian languages.\nTo evaluate the quality of the mined bitexts,\nwe train NMT systems for most of the language pairs and evaluate them on TED, WMT\nand WAT test sets. Using our mined bitexts\nonly and no human translated parallel data, we\nachieve a new state-of-the-art for a single system on the WMT\u201919 test set for translation between English and German, Russian and Chinese, as well as German/French. In particular, our English/German system outperforms\nthe best single one by close to 4 BLEU points\nand is almost on par with the best WMT\u201919 evaluation system which uses system combination and back-translation. 
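CCMatrix mines bitext with margin-based scoring over multilingual sentence embeddings; a self-contained numpy sketch of the ratio-margin criterion (Artetxe and Schwenk's formulation), assuming embedding rows are already L2-normalized:

```python
# Sketch of the ratio-margin criterion used in LASER-style bitext mining:
# score(x, y) = cos(x, y) / mean cosine of x and y to their k nearest
# neighbours. Assumes rows of X (source) and Y (target) are L2-normalized.
import numpy as np

def margin_scores(X, Y, k=4):
    sims = X @ Y.T                                        # cosine similarity matrix
    knn_x = np.sort(sims, axis=1)[:, -k:].mean(axis=1)    # x -> its k-NN in Y
    knn_y = np.sort(sims, axis=0)[-k:, :].mean(axis=0)    # y -> its k-NN in X
    return sims / ((knn_x[:, None] + knn_y[None, :]) / 2)

rng = np.random.default_rng(0)
X = rng.normal(size=(5, 16))
X /= np.linalg.norm(X, axis=1, keepdims=True)
Y = rng.normal(size=(5, 16))
Y /= np.linalg.norm(Y, axis=1, keepdims=True)
pairs = margin_scores(X, Y).argmax(axis=1)   # best target candidate per source
print(pairs)
```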
We also achieve excellent results for distant language pairs like\nRussian/Japanese, outperforming the best submission at the 2019 workshop on Asian Translation (WAT).", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/ceap.json b/datasets/ceap.json new file mode 100644 index 0000000..0c469c9 --- /dev/null +++ b/datasets/ceap.json @@ -0,0 +1,36 @@ +{ + "Name": "CEAP", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/CEAP", + "Link": "https://sourceforge.net/projects/ceap-bp/", + "License": "unknown", + "Year": 2021, + "Language": "ar", + "Dialect": "ar-CLS: (Arabic (Classic))", + "Domain": "web pages", + "Form": "text", + "Collection Style": "crawling", + "Description": "It contains 50 TXT files recording poetry composition by various authors from the 6th and 7th centuries. It was derived from two corpora: King Saud University Classical Arabic Corpus (KSUCAC) created by the team led by Maha S. Alrabiah (2014), and a corpus prepared by Abeer Alsheddi (2016).", + "Volume": "50", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "nan", + "Derived From": "Alrabiah, Maha S. (2014): King Saud University Classical Arabic Corpus, Ar-Riya\u0304\u1e0d. Alsheddi, Abeer. (2016): Edit Distance Adapted to Natural Language Words. M.A. Thesis. Ar-Riya\u0304\u1e0d.", + "Paper Title": "Cultural Conceptualizations of shame & dishonor in Early Poetic Arabic (EPA)", + "Paper Link": "https://www.ejournals.eu/pliki/art/20227/", + "Script": "Latn", + "Tokenized": "No", + "Host": "sourceforge", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "cultural conceptualizations of shame and dishonor", + "Venue Title": "The Polish Journal of the Arts and Culture", + "Citations": "nan", + "Venue Type": "journal", + "Venue Name": "The Polish Journal of the Arts and Culture. New Series", + "Authors": "Bartosz Pietrzak", + "Affiliations": "Institute of Oriental Studies of Jagiellonian University in Krakow.", + "Abstract": "Persisting in a binary relationship with honor, shame was an important element of the pre-Islamic Arabic social evaluation system. In my\nstudy, I analyzed the two most important EPA concepts parallel to\nEnglish shame \u2013 \u02c1ayb and \u02c1a\u0304r \u2013 applying the Cultural Linguistic approach. Based on analyses of a corpus of Early Arabic poetry and\nClassical Arabic dictionaries, I represented cultural schemata encoding the knowledge shared by pre-Islamic Arabs about those phenomena. The paper also presents metaphoric, metonymic, and image-schematic models, which account for the specifics of associated linguistic\nframes. 
Moreover, I posit a hypothesis on the existence of a schema\nsubsuming the honor- and shame-dishonor-related schemata in the\nform of social evaluation of usefulness, which seems to correspond\nto the historical and linguistic data.", + "Added By": "Mustafa Ghaleb" +} \ No newline at end of file diff --git a/datasets/checkthat-ar.json b/datasets/checkthat-ar.json new file mode 100644 index 0000000..7a6fd1a --- /dev/null +++ b/datasets/checkthat-ar.json @@ -0,0 +1,36 @@ +{ + "Name": "CheckThat-AR", + "Subsets": [], + "HF Link": "nan", + "Link": "https://gitlab.com/bigirqu/checkthat-ar/", + "License": "unknown", + "Year": 2020, + "Language": "ar", + "Dialect": "mixed", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "Arabic tweet datasets for claim check-worthiness estimation and verification from the CheckThat! 2020 lab", + "Volume": "7,500", + "Unit": "sentences", + "Ethical Risks": "Medium", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "Overview of CheckThat! 2020 Arabic:", + "Paper Link": "http://www.dei.unipd.it/~ferro/CLEF-WN-Drafts/CLEF2020/paper_257.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitLab", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "claim verification", + "Venue Title": "CLEF", + "Citations": "9.0", + "Venue Type": "conference", + "Venue Name": "Conference and Labs of the Evaluation Forum", + "Authors": "Maram Hasanain,Fatima Haouari,Reem Suwaileh,Zien Sheikh Ali,Bayan Hamdan,Tamer Elsayed,Alberto Barr\u00f3n-Cede\u00f1o,Giovanni Da San Martino,Preslav Nakov", + "Affiliations": ",,,,,,,Qatar Computing Research Institute,", + "Abstract": "nan", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/childes_egyptian_arabic_salama_corpus.json b/datasets/childes_egyptian_arabic_salama_corpus.json new file mode 100644 index 0000000..cdb571e --- /dev/null +++ b/datasets/childes_egyptian_arabic_salama_corpus.json @@ -0,0 +1,36 @@ +{ + "Name": "CHILDES Egyptian Arabic Salama Corpus", + "Subsets": [], + "HF Link": "nan", + "Link": "https://childes.talkbank.org/access/Other/Arabic/Salama.html", + "License": "unknown", + "Year": 2015, + "Language": "ar", + "Dialect": "ar-EG: (Arabic (Egypt))", + "Domain": "transcribed audio", + "Form": "spoken", + "Collection Style": "other", + "Description": "Participants: The Egyptian Arabic corpus includes data from ten children. Five boys and five girls were selected randomly with no language delay from a nursery in Alexandria. All children were normal and their first language was Arabic. The children ranged in age from 1;7 to 3;8 years (mean age 2.77), studied cross-sectionally. Seven children were visited in their kindergarten and three at home. The total number of utterances for all 10 children is 25,645. 
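CHILDES corpora such as the Salama corpus above are distributed as CHAT (.cha) transcripts, where main tiers start with *SPEAKER: lines; a minimal extraction sketch that ignores dependent tiers and continuation lines (the file name is hypothetical):

```python
# Sketch: pulling child utterances out of a CHILDES CHAT (.cha) transcript.
# Main tiers start with "*SPEAKER:", e.g. "*CHI:"; dependent tiers start
# with "%". This covers only the basic tier convention, not full CHAT syntax.
def child_utterances(path, speaker="CHI"):
    utts = []
    with open(path, encoding="utf-8") as f:
        for line in f:
            if line.startswith(f"*{speaker}:"):
                utts.append(line.split(":", 1)[1].strip())
    return utts

for utt in child_utterances("salama_child01.cha")[:5]:   # hypothetical file
    print(utt)
```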
The adult part of the corpus contains 14,868 utterances: 2,518 from the mother and 12,350 from the investigator; a further 10,777 utterances come from children. Data collection: Speech samples were collected as spontaneous speech in unstructured interviews. Data were elicited through conversation, naming objects and pictures around the child in his/her environment, using things the children normally use rather than something new, and having them describe what they were doing while playing. Natural interaction was encouraged in all settings, such as sitting with a child in class, playing with the child, and interacting with a mother and/or teacher during the teaching process. The interview became increasingly semi-structured once a child was able to produce morphemes: for example, when a child produced a singular noun, the investigator and/or mother asked about the plural form. Data were collected from six children in a nursery by the investigator, from one child at home with the mother and the investigator, and from two children at home by their mothers. Audio recordings of spontaneous speech produced by the children were obtained in natural settings, in the child's home or kindergarten.", + "Volume": "nan", + "Unit": "hours", + "Ethical Risks": "nan", + "Provider": "Alexandria University", + "Derived From": "nan", + "Paper Title": "Building a spoken Arabic corpus for Egyptian children: data collection and transcription. Master's thesis. Alexandria University.", + "Paper Link": "https://www.academia.edu/44521353/Building_a_spoken_Arabic_corpus_for_Egyptian_children_Data_collection_and_transcription", + "Script": "Latn", + "Tokenized": "No", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "speech recognition", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "Heba Salama", + "Affiliations": "Alexandria University", + "Abstract": "nan", + "Added By": "Kamel GAANOUN" +} \ No newline at end of file diff --git a/datasets/ciad__corpus_of_iraqi_arabic_dialect.json b/datasets/ciad__corpus_of_iraqi_arabic_dialect.json new file mode 100644 index 0000000..d6ead62 --- /dev/null +++ b/datasets/ciad__corpus_of_iraqi_arabic_dialect.json @@ -0,0 +1,36 @@ +{ + "Name": "CIAD: Corpus of Iraqi Arabic Dialect", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/Iraqi_Dialect", + "Link": "https://github.com/ebady/Iraqi-Arabic-Dialect-Dataset", + "License": "CC0", + "Year": 2022, + "Language": "ar", + "Dialect": "ar-IQ: (Arabic (Iraq))", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling", + "Description": "The corpus has been collected, annotated and made publicly accessible to other researchers for sentiment analysis research.", + "Volume": "1,170", + "Unit": "sentences", + "Ethical Risks": "Medium", + "Provider": "Multiple Institutions", + "Derived From": "nan", + "Paper Title": "Constructing twitter corpus of Iraqi Arabic Dialect (CIAD) for sentiment analysis", + "Paper Link": "https://ntv.ifmo.ru/file/article/21138.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "sentiment analysis", + "Venue Title": "STJITMO", + "Citations": "nan", + "Venue Type": "journal", + "Venue Name": "Scientific and Technical Journal of Information Technologies, Mechanics and Optics", + "Authors": "Mohammed M. Hassoun Al-Jawad, Hasanein Alharbi, Ahmed F. 
Almukhtar, Anwar Adnan Alnawas", + "Affiliations": "University of Kerbala, University of Babylon, and Southern Technical University - Iraq", + "Abstract": "The number of Twitter users in Iraq has increased significantly in recent years. Major events and the political situation in the country had a significant impact on the content of Twitter and affected the tweets of Iraqi users. Creating an Iraqi Arabic Dialect corpus is crucial for sentiment analysis to study such behaviors. Since no such corpus existed, this paper introduces the Corpus of Iraqi Arabic Dialect (CIAD). The corpus has been collected, annotated and made publicly accessible to other researchers for further investigation. Furthermore, the created corpus has been validated using eight different combinations of four feature-selection approaches and two versions of the Support Vector Machine (SVM) algorithm. Various performance measures were calculated. The obtained accuracy, 78 %, indicates a promising potential application.", + "Added By": "Mourad Mars" +} \ No newline at end of file diff --git a/datasets/cidar.json b/datasets/cidar.json new file mode 100644 index 0000000..41c8f7a --- /dev/null +++ b/datasets/cidar.json @@ -0,0 +1,36 @@ +{ + "Name": "CIDAR", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/CIDAR", + "Link": "https://hf.co/datasets/arbml/CIDAR", + "License": "CC BY-NC 4.0", + "Year": 2024, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "CIDAR contains 10,000 instructions and their output. The dataset was created by selecting around 9,109 samples from the AlpaGasus dataset and then translating them to Arabic using ChatGPT. In addition, these were supplemented with around 891 Arabic grammar instructions from the website Ask the Teacher. All the 10,000 samples were reviewed by around 12 reviewers.", + "Volume": "10,000", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "ARBML", + "Derived From": "AlpaGasus", + "Paper Title": "CIDAR: Culturally Relevant Instruction Dataset For Arabic", + "Paper Link": "https://arxiv.org/pdf/2402.03177.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "HuggingFace", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "instruction tuning", + "Venue Title": "arXiv", + "Citations": "nan", + "Venue Type": "preprint", + "Venue Name": "nan", + "Authors": "Zaid Alyafeai, Khalid Almubarak, Ahmed Ashraf, Deema Alnuhait, Saied Alshahrani, Gubran A. Q. Abdulrahman, Gamil Ahmed, Qais Gawah, Zead Saleh, Mustafa Ghaleb, Yousef Ali, Maged S. Al-Shaibani", + "Affiliations": "nan", + "Abstract": "Instruction tuning has emerged as a prominent methodology for teaching Large Language Models (LLMs) to follow instructions.\nHowever, current instruction datasets predominantly cater to English or are derived from\nEnglish-dominated LLMs, resulting in inherent\nbiases toward Western culture. This bias significantly impacts the linguistic structures of\nnon-English languages such as Arabic, which\nhas a distinct grammar reflective of the diverse cultures across the Arab region. This\npaper addresses this limitation by introducing CIDAR,\nthe first open Arabic instruction-tuning dataset culturally aligned by human reviewers. CIDAR contains 10,000 instruction\nand output pairs that represent the Arab region. We discuss the cultural relevance of\nCIDAR via the analysis and comparison to other\nmodels fine-tuned on other datasets. 
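The CIAD entry above validates the corpus with feature-selection plus SVM combinations; a minimal scikit-learn baseline in that spirit, with a hypothetical CSV layout standing in for the released format:

```python
# Minimal sentiment baseline in the spirit of the CIAD experiments above:
# character n-gram TF-IDF features with a linear SVM. The CSV layout
# (columns "text" and "label") is a hypothetical stand-in.
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC

df = pd.read_csv("ciad_tweets.csv")                 # hypothetical file name
X_tr, X_te, y_tr, y_te = train_test_split(
    df["text"], df["label"], test_size=0.2, random_state=0)

model = make_pipeline(
    TfidfVectorizer(analyzer="char_wb", ngram_range=(2, 4)),
    LinearSVC())
model.fit(X_tr, y_tr)
print(f"accuracy: {model.score(X_te, y_te):.3f}")
```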
Our experiments show that CIDAR can help enrich\nresearch efforts in aligning LLMs with the\nArabic culture. All the code is available.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/clartts__an_open-source_classical_arabic_text-to-speech_corpus.json b/datasets/clartts__an_open-source_classical_arabic_text-to-speech_corpus.json new file mode 100644 index 0000000..d36b5ef --- /dev/null +++ b/datasets/clartts__an_open-source_classical_arabic_text-to-speech_corpus.json @@ -0,0 +1,36 @@ +{ + "Name": "ClArTTS: An Open-Source Classical Arabic Text-to-Speech Corpus", + "Subsets": [], + "HF Link": "https://hf.co/datasets/MBZUAI/ClArTTS", + "Link": "https://hf.co/datasets/MBZUAI/ClArTTS", + "License": "CC BY 4.0", + "Year": 2023, + "Language": "ar", + "Dialect": "ar-CLS: (Arabic (Classic))", + "Domain": "transcribed audio", + "Form": "spoken", + "Collection Style": "other", + "Description": "ClArTTS is a Classical Arabic speech corpus designed for E2E TTS systems. The speech is extracted from a LibriVox audiobook, which is then processed, segmented, and manually transcribed and annotated. It contains about 12 hours of speech from a single male speaker sampled at 40100 Hz.", + "Volume": "12", + "Unit": "hours", + "Ethical Risks": "Low", + "Provider": "Mohamed Bin Zayed University of Artificial Intelligence", + "Derived From": "LibriVox", + "Paper Title": "ClArTTS: An Open-Source Classical Arabic Text-to-Speech Corpus", + "Paper Link": "https://www.isca-archive.org/interspeech_2023/kulkarni23_interspeech.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "HuggingFace", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "speech recognition", + "Venue Title": "INTERSPEECH", + "Citations": "5.0", + "Venue Type": "conference", + "Venue Name": "Interspeech", + "Authors": "Ajinkya Kulkarni, Atharva Kulkarni, Sara Abedalmon\u2019em Mohammad Shatnawi, Hanan Aldarmaki", + "Affiliations": "MBZUAI, Erisha Labs", + "Abstract": "We present a Classical Arabic Text-to-Speech (ClArTTS) corpus to facilitate the development of end-to-end TTS systems for the Arabic language. The speech is extracted from a LibriVox audiobook, which is then processed, segmented, and manually transcribed and annotated. The ClArTTS corpus contains about 12 hours of speech from a single male speaker sampled at 40100 Hz. In this paper, we describe the process of corpus creation, details of corpus statistics, and a comparison with existing resources. Furthermore, we develop two TTS systems based on Grad-TTS and Glow-TTS and illustrate the performance of the resulting systems via subjective and objective evaluations.", + "Added By": "Karima Kadaoui" +} \ No newline at end of file diff --git a/datasets/classical_arabic_dictionary.json b/datasets/classical_arabic_dictionary.json new file mode 100644 index 0000000..fd62dba --- /dev/null +++ b/datasets/classical_arabic_dictionary.json @@ -0,0 +1,36 @@ +{ + "Name": "Classical Arabic Dictionary", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2021L01", + "License": "LDC User Agreement for Non-Members", + "Year": 2021, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "web pages", + "Form": "text", + "Collection Style": "other", + "Description": "The dictionary is presented in three formats: plain text in UTF-8 encoding, plain text in CP1256 encoding, and as an SQL database file. 
Source documents are presented in UTF-8 and CP1256 encodings.", + "Volume": "nan", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "250.00 $", + "Test Split": "No", + "Tasks": "information retrieval,historical linguistics,language generation,language modeling,morphology", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/cleananercorp.json b/datasets/cleananercorp.json new file mode 100644 index 0000000..504b37c --- /dev/null +++ b/datasets/cleananercorp.json @@ -0,0 +1,36 @@ +{ + "Name": "CLEANANERCorp", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/CLEANANERCorp", + "Link": "https://github.com/iwan-rg/CLEANANERCorp", + "License": "LGPL-3.0", + "Year": 2024, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "we conducted empirical research to understand the errors in ANERcorp, correct them and propose a cleaner version of the dataset named CLEANANERCorp. CLEANANERCorp will serve the research community as a more accurate and consistent benchmark", + "Volume": "150,000", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "IWAN", + "Derived From": "ANERcorp", + "Paper Title": "CLEANANERCorp: Identifying and Correcting Incorrect Labels in the ANERcorp Dataset", + "Paper Link": "https://aclanthology.org/2024.osact-1.2.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "named entity recognition", + "Venue Title": "OSACT", + "Citations": "nan", + "Venue Type": "workshop", + "Venue Name": "Workshop on Open-Source Arabic Corpora and Processing Tools", + "Authors": "Mashael AlDuwais, Hend Al-Khalifa and Abdulmalik AlSalman", + "Affiliations": "nan", + "Abstract": "Label errors are a common issue in machine learning datasets, particularly for tasks such as Named Entity Recognition.\nSuch label errors might hurt model training, affect evaluation results, and lead to an inaccurate assessment of model\nperformance. In this study, we dived deep into one of the widely adopted Arabic NER benchmark datasets (ANERcorp)\nand found a significant number of annotation errors, missing labels, and inconsistencies. Therefore, in this study, we\nconducted empirical research to understand these errors, correct them and propose a cleaner version of the dataset\nnamed CLEANANERCorp. 
CLEANANERCorp will serve the research community as a more accurate and consistent\nbenchmark.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/coda__(conventional_orthography_for_dialectal_arabic).json b/datasets/coda__(conventional_orthography_for_dialectal_arabic).json new file mode 100644 index 0000000..3662462 --- /dev/null +++ b/datasets/coda__(conventional_orthography_for_dialectal_arabic).json @@ -0,0 +1,36 @@ +{ + "Name": "CODA* (Conventional Orthography for Dialectal Arabic)", + "Subsets": [], + "HF Link": "nan", + "Link": "https://github.com/CAMeL-Lab/camel-guidelines/blob/master/docs/orthography.md", + "License": "Non Commercial Use - ELRA END USER", + "Year": 2011, + "Language": "ar", + "Dialect": "mixed", + "Domain": "other", + "Form": "text", + "Collection Style": "manual curation", + "Description": "CODA* is a unified orthographic system for Arabic dialects designed for computational purposes. It maintains dialectal uniqueness while aligning with some conventions of Modern Standard Arabic (MSA).", + "Volume": "nan", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "NYU Abu Dhabi", + "Derived From": "Manually derived from phonological, morphological, and syntactic studies of dialects.", + "Paper Title": "A Spelling Correction Corpus for Multiple Arabic Dialects", + "Paper Link": "https://aclanthology.org/L16-1006.pdf", + "Script": "Arab", + "Tokenized": "Yes", + "Host": "CAMeL Resources", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "morphological analysis", + "Venue Title": "LREC (Language Resources and Evaluation Conference)", + "Citations": "nan", + "Venue Type": "conference", + "Venue Name": "LREC (Language Resources and Evaluation Conference)", + "Authors": "Nizar Habash (Project Lead), Fadhl Eryani (Current Maintainer), and others from NYUAD and collaborating universities.", + "Affiliations": "New York University Abu Dhabi (NYUAD), Birzeit University, Universit\u00e9 de Sfax, Carnegie Mellon University in Qatar, and more.", + "Abstract": "nan", + "Added By": "Maryam Al Emadi " +} \ No newline at end of file diff --git a/datasets/commonlanguage.json b/datasets/commonlanguage.json new file mode 100644 index 0000000..5712b81 --- /dev/null +++ b/datasets/commonlanguage.json @@ -0,0 +1,36 @@ +{ + "Name": "CommonLanguage", + "Subsets": [], + "HF Link": "https://hf.co/datasets/anton-l/common_language", + "Link": "https://github.com/speechbrain/speechbrain/tree/develop/recipes/CommonLanguage", + "License": "CC BY 4.0", + "Year": 2021, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "transcribed audio", + "Form": "spoken", + "Collection Style": "other", + "Description": "This dataset is composed of speech recordings from languages that were carefully selected from the CommonVoice database. The total duration of audio recordings is 45.1 hours (i.e., 1 hour of material for each language). 
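A language-identification model trained on the CommonLanguage recipe above is published by SpeechBrain; a usage sketch follows, where the model id is taken from the SpeechBrain Hugging Face releases and should be verified on the hub:

```python
# Sketch: spoken language identification with a model trained on the
# CommonLanguage recipe above. The model id follows the SpeechBrain
# Hugging Face releases; treat exact names as assumptions and check the hub.
from speechbrain.pretrained import EncoderClassifier

lang_id = EncoderClassifier.from_hparams(
    source="speechbrain/lang-id-commonlanguage_ecapa",
    savedir="pretrained_lang_id")
out_prob, score, index, labels = lang_id.classify_file("sample_arabic.wav")
print(labels)   # e.g. ['Arabic']
```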
The dataset has been extracted from CommonVoice to train language-id systems.", + "Volume": "1", + "Unit": "hours", + "Ethical Risks": "Low", + "Provider": "SpeechBrain", + "Derived From": "CommonVoice", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "speech recognition", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/commonsense_validation.json b/datasets/commonsense_validation.json new file mode 100644 index 0000000..7270cd7 --- /dev/null +++ b/datasets/commonsense_validation.json @@ -0,0 +1,36 @@ +{ + "Name": "Commonsense validation", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/Commonsense_Validation", + "Link": "https://github.com/msmadi/Arabic-Dataset-for-Commonsense-Validationion", + "License": "CC BY-SA 4.0", + "Year": 2020, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "machine translation", + "Description": "a benchmark Arabic dataset for commonsense understanding and validation", + "Volume": "12,000", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "Jordan University", + "Derived From": "nan", + "Paper Title": "Is this sentence valid? An Arabic Dataset for Commonsense Validation", + "Paper Link": "https://arxiv.org/abs/2008.10873", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "commonsense validation", + "Venue Title": "ArXiv", + "Citations": "1.0", + "Venue Type": "preprint", + "Venue Name": "ArXiv", + "Authors": "Saja Khaled Tawalbeh,Mohammad Al-Smadi", + "Affiliations": ",", + "Abstract": "The commonsense understanding and validation remains a challenging task in the field of natural language understanding. Therefore, several research papers have been published that studied the capability of proposed systems to evaluate the model's ability to validate commonsense in text. In this paper, we present a benchmark Arabic dataset for commonsense understanding and validation as well as baseline research and models trained using the same dataset. To the best of our knowledge, this dataset is considered the first in the field of Arabic text commonsense validation. 
The dataset is distributed under the Creative Commons BY-SA 4.0 license and can be found on GitHub.", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/comparable_wikipedia_coprus.json b/datasets/comparable_wikipedia_coprus.json new file mode 100644 index 0000000..af1478f --- /dev/null +++ b/datasets/comparable_wikipedia_coprus.json @@ -0,0 +1,49 @@ +{ + "Name": "Comparable Wikipedia Corpus", + "Subsets": [ + { + "Name": "Arabic Wikipedia", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Volume": "10,197", + "Unit": "documents" + }, + { + "Name": "Egyptian Wikipedia", + "Dialect": "ar-EG: (Arabic (Egypt))", + "Volume": "10,197", + "Unit": "documents" + } + ], + "HF Link": "https://hf.co/datasets/arbml/comparable_arabizi", + "Link": "https://github.com/motazsaad/comparableWikiCoprus", + "License": "CC BY-SA 4.0", + "Year": 2017, + "Language": "ar", + "Dialect": "mixed", + "Domain": "wikipedia", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "Comparable Wikipedia corpus (aligned documents) extracted from the 20-01-2017 Wikipedia dumps", + "Volume": "20,394", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "Islamic University of Gaza", + "Derived From": "nan", + "Paper Title": "WikiDocsAligner: An Off-the-Shelf Wikipedia Documents Alignment Tool", + "Paper Link": "https://ieeexplore.ieee.org/document/8038320", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "machine translation", + "Venue Title": "PICICT", + "Citations": "4.0", + "Venue Type": "conference", + "Venue Name": "Palestinian International Conference on Information and Communication Technology", + "Authors": "Motaz Saad,B. Alijla", + "Affiliations": "The Islamic University of Gaza,", + "Abstract": "Wikipedia encyclopedia is an attractive source for comparable corpora in many languages. Most researchers develop their own script to perform the document alignment task, which requires effort and time. In this paper, we present WikiDocsAligner, a handy off-the-shelf Wikipedia article alignment tool. The implementation of WikiDocsAligner does not require the researchers to import/export interlanguage links databases. The user just needs to download Wikipedia dumps (interlanguage links and articles), then provide them to the tool, which performs the alignment. This software can be used easily to align Wikipedia documents in any language pair. Finally, we use WikiDocsAligner to align comparable documents from Arabic Wikipedia and Egyptian Wikipedia. So we shed light on Wikipedia as a source of Arabic dialect language resources. 
The produced resources are interesting and useful as the demand for Arabic/dialect language resources has increased in the last decade.", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/comprehensive_word_lists_for_chinese,_japanese,_korean_and_arabic.json b/datasets/comprehensive_word_lists_for_chinese,_japanese,_korean_and_arabic.json new file mode 100644 index 0000000..aa7cbe4 --- /dev/null +++ b/datasets/comprehensive_word_lists_for_chinese,_japanese,_korean_and_arabic.json @@ -0,0 +1,36 @@ +{ + "Name": "Comprehensive Word Lists for Chinese, Japanese, Korean and Arabic", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalogue.elra.info/en-us/repository/browse/ELRA-M0071/", + "License": "Non Commercial Use - ELRA END USER", + "Year": 2019, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "Comprehensive monolingual word lists for both Simplified and Traditional Chinese, Japanese, Korean and Arabic, including a full-form Arabic word list. For Simplified and Traditional Chinese, Japanese and Korean, we provide readings as well, making them ideal for speech-related applications such as speech synthesis. The two Arabic databases include both vocalized and romanized Arabic.", + "Volume": "nan", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "ELRA", + "Access": "With-Fee", + "Cost": "37,500.00\u20ac", + "Test Split": "No", + "Tasks": "speech recognition", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/cormap__corpus_for_moroccan_arabic_processing.json b/datasets/cormap__corpus_for_moroccan_arabic_processing.json new file mode 100644 index 0000000..f426848 --- /dev/null +++ b/datasets/cormap__corpus_for_moroccan_arabic_processing.json @@ -0,0 +1,36 @@ +{ + "Name": "CORMAP: Corpus for Moroccan Arabic Processing", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/CORMAP", + "Link": "https://lindat.mff.cuni.cz/repository/xmlui/handle/11372/LRT-3551", + "License": "LGPL-3.0", + "Year": 2017, + "Language": "ar", + "Dialect": "ar-MA: (Arabic (Morocco))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "This resource is a corpus containing 34k Moroccan Colloquial Arabic sentences collected from different sources. The sentences are written in Arabic script. 
This resource can be useful in some NLP applications such as Language Identification.", + "Volume": "34,000", + "Unit": "sentences", + "Ethical Risks": "nan", + "Provider": "LINDAT/CLARIAH-CZ", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "language identification", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "Ridouane Tachicart, Karim Bouzoubaa", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Kamel GAANOUN" +} \ No newline at end of file diff --git a/datasets/coronavirus.json b/datasets/coronavirus.json new file mode 100644 index 0000000..e8c4da1 --- /dev/null +++ b/datasets/coronavirus.json @@ -0,0 +1,36 @@ +{ + "Name": "Coronavirus", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/CoronaVirus", + "Link": "https://github.com/aseelad/Coronavirus-Public-Arabic-Twitter-Data-Set/", + "License": "CC BY-NC-SA 4.0", + "Year": 2020, + "Language": "ar", + "Dialect": "mixed", + "Domain": "social media", + "Form": "text", + "Collection Style": "other", + "Description": "Contains Arabic tweets collected from December 1st, 2019 until April 11th, 2020", + "Volume": "3,800,856", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "Imam Mohammad Bin Saud University", + "Derived From": "nan", + "Paper Title": "Coronavirus: Public Arabic Twitter Dataset", + "Paper Link": "https://www.preprints.org/manuscript/202004.0263/v1", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "text generation, language modeling", + "Venue Title": "other", + "Citations": "3.0", + "Venue Type": "preprint", + "Venue Name": "nan", + "Authors": "Aseel Addawood", + "Affiliations": "nan", + "Abstract": "The COVID-19 pandemic spread of the coronavirus across the globe has affected our lives on many different levels. The world we knew before the spread of the virus has become another one. Every country has taken preventive measures, including social distancing, travel restrictions, and curfew, to control the spread of the disease. With these measures implemented, people have shifted to social media platforms in the online sphere, such as Twitter, to maintain connections. In this paper, we describe a coronavirus data set of Arabic tweets collected from January 1, 2020, primarily from hashtags populated from Saudi Arabia.
This data set is available to the research community to glean a better understanding of the societal, economical, and political effects of the outbreak and to help policy makers make better decisions for fighting this epidemic.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/corpora_for_egyptian_arabic_and_gulf_arabic_from_twitter.json b/datasets/corpora_for_egyptian_arabic_and_gulf_arabic_from_twitter.json new file mode 100644 index 0000000..e0173b9 --- /dev/null +++ b/datasets/corpora_for_egyptian_arabic_and_gulf_arabic_from_twitter.json @@ -0,0 +1,49 @@ +{ + "Name": "Corpora for Egyptian Arabic and Gulf Arabic from Twitter", + "Subsets": [ + { + "Name": "GLF", + "Dialect": "ar-GLF: (Arabic (Gulf))", + "Volume": "150,000", + "Unit": "sentences" + }, + { + "Name": "EGY", + "Dialect": "ar-EG: (Arabic (Egypt))", + "Volume": "150,000", + "Unit": "sentences" + } + ], + "HF Link": "https://hf.co/datasets/arbml/Egyptial_Gulf_Twitter_Dataset", + "Link": "https://github.com/telsahy/capstone-34", + "License": "unknown", + "Year": 2018, + "Language": "ar", + "Dialect": "mixed", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "An evenly balanced dataset of Arabic dialects, Egyptian and Gulf using a variety of dialectal terms.", + "Volume": "300,000", + "Unit": "sentences", + "Ethical Risks": "Medium", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "Building Datasets for Dialect Classifiers using Twitter", + "Paper Link": "https://github.com/telsahy/capstone-34/blob/master/Building%20Datasets%20for%20Arabic%20Dialect%20Classifiers%20using%20Twitter.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "dialect identification", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "preprint", + "Venue Name": "nan", + "Authors": "Tamir ElSahy", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/corpus_of_offensive_language_in_arabic.json b/datasets/corpus_of_offensive_language_in_arabic.json new file mode 100644 index 0000000..73ba0e0 --- /dev/null +++ b/datasets/corpus_of_offensive_language_in_arabic.json @@ -0,0 +1,36 @@ +{ + "Name": "Corpus of Offensive Language in Arabic", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/Corpus_of_Offensive_Language_in_Arabic", + "Link": "https://onedrive.live.com/?authkey=!ACDXj_ZNcZPqzy0&id=6EF6951FBF8217F9!105&cid=6EF6951FBF8217F9", + "License": "unknown", + "Year": 2018, + "Language": "ar", + "Dialect": "mixed", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "a dataset of YouTube comments in Arabic, specifically designed to be used for the detection of offensive language in a machine learning scenario", + "Volume": "15,050", + "Unit": "sentences", + "Ethical Risks": "High", + "Provider": "University of Limerick", + "Derived From": "nan", + "Paper Title": "Dataset Construction for the Detection of Anti-Social Behaviour in Online Communication in Arabic", + "Paper Link": "https://core.ac.uk/download/pdf/211161742.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "OneDrive", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "offensive language detection, hate speech detection", + "Venue Title": "ACLING", + "Citations": "33.0", + "Venue Type": "conference", + "Venue
Name": "nternational Conference on AI in Computational Linguistics", + "Authors": "Azalden Alakrot,Liam Murray,Nikola S. Nikolov", + "Affiliations": ",,", + "Abstract": "Abstract Warning: this paper contains a range of words which may cause offence. In recent years, many studies target anti-social behaviour such as offensive language and cyberbullying in online communication. Typically, these studies collect data from various reachable sources, the majority of the datasets being in English. However, to the best of our knowledge, there is no dataset collected from the YouTube platform targeting Arabic text and overall there are only a few datasets of Arabic text, collected from other social platforms for the purpose of offensive language detection. Therefore, in this paper we contribute to this field by presenting a dataset of YouTube comments in Arabic, specifically designed to be used for the detection of offensive language in a machine learning scenario. Our dataset contains a range of offensive language and flaming in the form of YouTube comments. We document the labelling process we have conducted, taking into account the difference in the Arab dialects and the diversity of perception of offensive language throughout the Arab world. Furthermore, statistical analysis of the dataset is presented, in order to make it ready for use as a training dataset for predictive modelling.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/covid-19-arabic-tweets-dataset.json b/datasets/covid-19-arabic-tweets-dataset.json new file mode 100644 index 0000000..de90636 --- /dev/null +++ b/datasets/covid-19-arabic-tweets-dataset.json @@ -0,0 +1,36 @@ +{ + "Name": "COVID-19-Arabic-Tweets-Dataset", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/COVID_19_Arabic_Tweets_Dataset", + "Link": "https://github.com/SarahAlqurashi/COVID-19-Arabic-Tweets-Dataset", + "License": "CC BY-NC-SA 4.0", + "Year": 2020, + "Language": "ar", + "Dialect": "mixed", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling", + "Description": "collection of Arabic tweets IDs related to novel coronavirus COVID-19.", + "Volume": "3,934,610", + "Unit": "sentences", + "Ethical Risks": "Medium", + "Provider": "Umm Al-Qura University", + "Derived From": "nan", + "Paper Title": "Large Arabic Twitter Dataset on COVID-19\r\n", + "Paper Link": "https://arxiv.org/pdf/2004.04315.pdf", + "Script": "Arab-Latn", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "behaviour analysis", + "Venue Title": "ArXiv", + "Citations": "26.0", + "Venue Type": "preprint", + "Venue Name": "ArXiv", + "Authors": "S. Alqurashi,Ahmad Alhindi,E. Alanazi", + "Affiliations": ",,", + "Abstract": "The 2019 coronavirus disease (COVID-19), emerged late December 2019 in China, is now rapidly spreading across the globe. At the time of writing this paper, the number of global confirmed cases has passed two millions and half with over 180,000 fatalities. Many countries have enforced strict social distancing policies to contain the spread of the virus. This have changed the daily life of tens of millions of people, and urged people to turn their discussions online, e.g., via online social media sites like Twitter. In this work, we describe the first Arabic tweets dataset on COVID-19 that we have been collecting since January 1st, 2020. The dataset would help researchers and policy makers in studying different societal issues related to the pandemic. 
Many other tasks related to behavioral change, information sharing, misinformation and rumors spreading can also be analyzed.", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/covid-19_disinfo__covid-19_disinformation_twitter_dataset.json b/datasets/covid-19_disinfo__covid-19_disinformation_twitter_dataset.json new file mode 100644 index 0000000..1eafcfa --- /dev/null +++ b/datasets/covid-19_disinfo__covid-19_disinformation_twitter_dataset.json @@ -0,0 +1,36 @@ +{ + "Name": "COVID-19 Disinfo: COVID-19 Disinformation Twitter Dataset", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/COVID_19_Disinformation_ar", + "Link": "https://github.com/firojalam/COVID-19-disinformation", + "License": "CC BY-NC-SA 4.0", + "Year": 2021, + "Language": "multilingual", + "Dialect": "mixed", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "With the emergence of the COVID-19 pandemic, the political and the medical aspects of disinformation merged as the problem got elevated to a whole new level to become the first global infodemic. Fighting this infodemic has been declared one of the most important focus areas of the World Health Organization, with dangers ranging from promoting fake cures, rumors, and conspiracy theories to spreading xenophobia and panic. Addressing the issue requires solving a number of challenging problems such as identifying messages containing claims, determining their check-worthiness and factuality, and their potential to do harm as well as the nature of that harm, to mention just a few. To address this gap, we release a large dataset of 16K manually annotated tweets for fine-grained disinformation analysis that focuses on COVID-19, combines the perspectives and the interests of journalists, fact-checkers, social media platforms, policy makers, and society, and covers Arabic, Bulgarian, Dutch, and English. Finally, we show strong evaluation results using pretrained Transformers, thus confirming the practical utility of the dataset in monolingual multilingual, and single task vs. multitask settings.", + "Volume": "5,000", + "Unit": "sentences", + "Ethical Risks": "High", + "Provider": "Multiple Institutions ", + "Derived From": "nan", + "Paper Title": "Fighting the COVID-19 Infodemic: Modeling the Perspective of Journalists, Fact-Checkers, Social Media Platforms, Policy Makers, and the Society", + "Paper Link": "https://pure.rug.nl/ws/portalfiles/portal/203339411/2021.findings_emnlp.56.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "fact checking", + "Venue Title": "Findings of EMNLP", + "Citations": "nan", + "Venue Type": "conference", + "Venue Name": "findings of Conference on Empirical Methods in Natural Language Processing", + "Authors": "Firoj Alam, Shaden Shaar, Fahim Dalvi, Hassan Sajjad, Alex Nikolov, Hamdy Mubarak, Giovanni Da San Martino, Ahmed Abdelali, Nadir Durrani, Kareem Darwish, Abdulaziz Al-Homaid, Wajdi Zaghouani, Tommaso Caselli, Gijs Danoe, Friso Stolk, Britt Bruntink, Preslav Nakov", + "Affiliations": "nan", + "Abstract": "With the emergence of the COVID-19 pandemic, the political and the medical aspects of disinformation merged as the problem got elevated to a whole new level to become the first global infodemic. 
Fighting this infodemic has been declared one of the most important focus areas of the World Health Organization, with dangers ranging from promoting fake cures, rumors, and conspiracy theories to spreading xenophobia and panic. Addressing the issue requires solving a number of challenging problems such as identifying messages containing claims, determining their check-worthiness and factuality, and their potential to do harm as well as the nature of that harm, to mention just a few. To address this gap, we release a large dataset of 16K manually annotated tweets for fine-grained disinformation analysis that focuses on COVID-19, combines the perspectives and the interests of journalists, fact-checkers, social media platforms, policy makers, and society, and covers Arabic, Bulgarian, Dutch, and English. Finally, we show strong evaluation results using pretrained Transformers, thus confirming the practical utility of the dataset in monolingual multilingual, and single task vs. multitask settings.", + "Added By": "Abdelrahman Kaseb" +} \ No newline at end of file diff --git a/datasets/covid-fakes.json b/datasets/covid-fakes.json new file mode 100644 index 0000000..8de73e0 --- /dev/null +++ b/datasets/covid-fakes.json @@ -0,0 +1,36 @@ +{ + "Name": "COVID-FAKES", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/COVID_FAES_ar", + "Link": "https://github.com/mohaddad/COVID-FAKES", + "License": "unknown", + "Year": 2020, + "Language": "multilingual", + "Dialect": "mixed", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "Bilingual (Arabic/English) COVID-19 Twitter dataset for misleading information detection", + "Volume": "3,263,000", + "Unit": "sentences", + "Ethical Risks": "Medium", + "Provider": "University of Victoria", + "Derived From": "nan", + "Paper Title": "COVID-19-FAKES: A Twitter (Arabic/English) Dataset for Detecting Misleading Information on COVID-19", + "Paper Link": "https://link.springer.com/chapter/10.1007/978-3-030-57796-4_25", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "fake news detection", + "Venue Title": "INCoS", + "Citations": "17.0", + "Venue Type": "conference", + "Venue Name": "International Conference on Intelligent Networking and Collaborative Systems", + "Authors": "Mohamed K. Elhadad,K. F. Li,F. Gebali", + "Affiliations": ",,", + "Abstract": "This paper aims to aid the ongoing research efforts for combating the Infodemic related to COVID-19. We provide an automatically annotated, bilingual (Arabic/English) COVID-19 Twitter dataset (COVID-19-FAKES). This dataset has been continuously collected from February 04, 2020, to March 10, 2020. For annotating the collected dataset, we utilized the shared information on the official websites and the official Twitter accounts of the WHO, UNICEF, and UN as a source of reliable information, and the collected COVID-19 pre-checked facts from different fact-checking websites to build a ground-truth database. Then, the Tweets in the COVID-19-FAKES dataset are annotated using 13 different machine learning algorithms and employing 7 different feature extraction techniques. We are making our dataset publicly available to the research community (https://github.com/mohaddad/COVID-FAKES). This work will help researchers in understanding the dynamics behind the COVID-19 outbreak on Twitter. 
Furthermore, it could help in studies related to sentiment analysis, the analysis of the propagation of misleading information related to this outbreak, the analysis of users\u2019 behavior during the crisis, the detection of botnets, the analysis of the performance of different classification algorithms with various feature extraction techniques that are used in text mining. It is worth noting that, in this paper, we use the terms of misleading information, misinformation, and fake news interchangeably.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/covost_2.json b/datasets/covost_2.json new file mode 100644 index 0000000..1e0c3dd --- /dev/null +++ b/datasets/covost_2.json @@ -0,0 +1,36 @@ +{ + "Name": "CoVoST 2", + "Subsets": [], + "HF Link": "https://hf.co/datasets/facebook/covost2", + "Link": "https://github.com/facebookresearch/covost", + "License": "CC0", + "Year": 2020, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "transcribed audio", + "Form": "spoken", + "Collection Style": "crawling and annotation(other)", + "Description": "a large-scale multilingual ST corpus based on Common Voice, to foster ST research with the largest ever open dataset. Its latest version covers translations from English into 15 languages---Arabic, Catalan, Welsh, German, Estonian, Persian, Indonesian, Japanese, Latvian, Mongolian, Slovenian, Swedish, Tamil, Turkish, Chinese", + "Volume": "6", + "Unit": "hours", + "Ethical Risks": "Low", + "Provider": "Facebook AI", + "Derived From": "Common Voice", + "Paper Title": "CoVoST 2 and Massively Multilingual Speech-to-Text Translation", + "Paper Link": "https://arxiv.org/pdf/2007.10310.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "speech recognition", + "Venue Title": "arXiv", + "Citations": "nan", + "Venue Type": "preprint", + "Venue Name": "nan", + "Authors": "Changhan Wang, Anne Wu, Juan Pino\n", + "Affiliations": "Facebook AI;Facebook AI;Facebook AI", + "Abstract": "Speech-to-text translation (ST) has recently become an increasingly popular topic of research,\npartly due to the development of benchmark\ndatasets. Nevertheless, current datasets cover\na limited number of languages. With the aim\nto foster research in massive multilingual ST\nand ST for low resource language pairs, we\nrelease CoVoST 2, a large-scale multilingual\nST corpus covering translations from 21 languages into English and from English into 15\nlanguages. This represents the largest open\ndataset available to date from total volume and\nlanguage coverage perspective. 
Data sanity\nchecks provide evidence about the quality of\nthe data, which is released under CC0 license.\nWe also provide extensive speech recognition,\nbilingual and multilingual machine translation\nand ST baselines with open-source implementation", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/cqa-md__semeval-2016_task_3.json b/datasets/cqa-md__semeval-2016_task_3.json new file mode 100644 index 0000000..81544f7 --- /dev/null +++ b/datasets/cqa-md__semeval-2016_task_3.json @@ -0,0 +1,36 @@ +{ + "Name": "CQA-MD: SemEval-2016 Task 3", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/CQA_MD_ar", + "Link": "https://alt.qcri.org/semeval2016/task3/index.php?id=data-and-tools", + "License": "unknown", + "Year": 2016, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "web pages", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "It includes a TRAIN/DEV split with reliable double-checked DEV (1,281 original questions, and 37,795 potentially related question-answer pairs) + unannotated (163,383 question-answer pairs)", + "Volume": "45,164", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "QCRI", + "Derived From": "nan", + "Paper Title": "SemEval-2016 Task 3: Community Question Answering", + "Paper Link": "https://alt.qcri.org/semeval2016/task3/data/uploads/semeval2016-task3-report.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "QCRI Resources", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "question answering", + "Venue Title": "SemEval", + "Citations": "nan", + "Venue Type": "conference", + "Venue Name": "International Workshop on Semantic Evaluation", + "Authors": "Preslav Nakov, Llu\u00eds M\u00e0rquez, Alessandro Moschitti, Walid Magdy, Hamdy Mubarak, Abed Alhakim Freihat", + "Affiliations": "ALT Research Group, Qatar Computing Research Institute, HBKU", + "Abstract": "This paper describes the SemEval\u20132016\nTask 3 on Community Question Answering, which we offered in English and Arabic. For English, we had three subtasks: Question\u2013Comment Similarity (subtask\nA), Question\u2013Question Similarity (B), and\nQuestion\u2013External Comment Similarity (C).\nFor Arabic, we had another subtask: Rerank\nthe correct answers for a new question (D).\nEighteen teams participated in the task, submitting a total of 95 runs (38 primary and 57\ncontrastive) for the four subtasks. A variety\nof approaches and features were used by the\nparticipating systems to address the different\nsubtasks, which are summarized in this paper.\nThe best systems achieved an official score\n(MAP) of 79.19, 76.70, 55.41, and 45.83 in\nsubtasks A, B, C, and D, respectively. These\nscores are significantly better than those for\nthe baselines that we provided.
For subtask A,\nthe best system improved over the 2015 winner by 3 points absolute in terms of Accuracy", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/cross-lingual_ner.json b/datasets/cross-lingual_ner.json new file mode 100644 index 0000000..5db7ac8 --- /dev/null +++ b/datasets/cross-lingual_ner.json @@ -0,0 +1,36 @@ +{ + "Name": "Cross-lingual NER", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/Zero_Shot_Cross_Lingual_NER_ar", + "Link": "https://github.com/ntunlp/Zero-Shot-Cross-Lingual-NER", + "License": "unknown", + "Year": 2020, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "NER for five different target languages \u2014 Spanish, Dutch, German, Arabic and Finnish", + "Volume": "2,687", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "Multiple Institutions", + "Derived From": "AQMAR", + "Paper Title": "Zero-Resource Cross-Lingual Named Entity Recognition", + "Paper Link": "https://arxiv.org/pdf/1911.09812.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "named entity recognition", + "Venue Title": "AAAI", + "Citations": "19.0", + "Venue Type": "conference", + "Venue Name": "Association for the Advancement of Artificial Intelligence", + "Authors": "M SAIFUL BARI,Shafiq R. Joty,Prathyusha Jwalapuram", + "Affiliations": "Nanyang Technological University,,", + "Abstract": "Recently, neural methods have achieved state-of-the-art (SOTA) results in Named Entity Recognition (NER) tasks for many languages without the need for manually crafted features. However, these models still require manually annotated training data, which is not available for many languages. In this paper, we propose an unsupervised cross-lingual NER model that can transfer NER knowledge from one language to another in a completely unsupervised way without relying on any bilingual dictionary or parallel data. Our model achieves this through word-level adversarial learning and augmented fine-tuning with parameter sharing and feature augmentation.
Experiments on five different languages demonstrate the effectiveness of our approach, outperforming existing models by a good margin and setting a new SOTA for each language pair.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/crosssum.json b/datasets/crosssum.json new file mode 100644 index 0000000..a7bffbf --- /dev/null +++ b/datasets/crosssum.json @@ -0,0 +1,36 @@ +{ + "Name": "CrossSum", + "Subsets": [], + "HF Link": "https://hf.co/datasets/csebuetnlp/CrossSum", + "Link": "https://github.com/csebuetnlp/CrossSum", + "License": "CC BY-NC-SA 4.0", + "Year": 2021, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "crawling", + "Description": "a large-scale dataset comprising 1.65 million cross-lingual article-summary samples in 1500+ language-pairs", + "Volume": "72,795", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "Multiple Institutions", + "Derived From": "nan", + "Paper Title": "CrossSum: Beyond English-Centric Cross-Lingual Abstractive Text Summarization for 1500+ Language Pairs", + "Paper Link": "https://arxiv.org/pdf/2112.08804.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "summarization", + "Venue Title": "arXiv", + "Citations": "nan", + "Venue Type": "preprint", + "Venue Name": "preprint", + "Authors": "Tahmid Hasan, Abhik Bhattacharjee, Wasi Uddin Ahmad, Yuan-Fang Li, Yong-Bin Kang, Rifat Shahriyar", + "Affiliations": "Bangladesh University of Engineering and Technology (BUET), University of California, Los Angeles, Monash University, Swinburne University of Technology", + "Abstract": "We present CrossSum, a large-scale dataset\ncomprising 1.65 million cross-lingual article-summary samples in 1500+ language-pairs\nconstituting 45 languages. We use the multilingual XL-Sum dataset and align identical articles written in different languages via cross-lingual retrieval using a language-agnostic representation model. We propose a multi-stage\ndata sampling algorithm and fine-tune mT5,\na multilingual pretrained model, with explicit\ncross-lingual supervision with CrossSum and\nintroduce a new metric for evaluating cross-lingual summarization. Results on established\nand our proposed metrics indicate that models\nfine-tuned on CrossSum outperforms summarization+translation baselines, even when the\nsource and target language pairs are linguistically distant. To the best of our knowledge,\nCrossSum is the largest cross-lingual summarization dataset and also the first-ever that does\nnot rely on English as the pivot language. We\nare releasing the dataset, alignment and training scripts, and the models to spur future research on cross-lingual abstractive summarization.
The resources can be found at https://github.com/csebuetnlp/CrossSum.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/cslu__22_languages_corpus.json b/datasets/cslu__22_languages_corpus.json new file mode 100644 index 0000000..0e703c6 --- /dev/null +++ b/datasets/cslu__22_languages_corpus.json @@ -0,0 +1,36 @@ +{ + "Name": "CSLU: 22 Languages Corpus", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2005S26", + "License": "LDC User Agreement for Non-Members", + "Year": 2002, + "Language": "multilingual", + "Dialect": "mixed", + "Domain": "transcribed audio", + "Form": "spoken", + "Collection Style": "other", + "Description": "Produced by Center for Spoken Language Understanding and distributed by the Linguistic Data Consortium, the 22 Languages corpus consists of telephone speech from 21 languages: Eastern Arabic, Cantonese, Czech, Farsi, German, Hindi, Hungarian, Japanese, Korean, Malay, Mandarin, Italian, Polish, Portuguese, Russian, Spanish, Swedish, Swahili, Tamil, Vietnamese, and English. The corpus contains fixed vocabulary utterances (e.g. days of the week) as well as fluent continuous speech. Each of the 50,191 utterances is verified by a native speaker to determine if the caller followed instructions when answering the prompts. For this release, approximately 19,758 utterances have corresponding orthographic transcriptions in all the above languages except Eastern Arabic, Farsi, Korean, Russian, Italian.", + "Volume": "84", + "Unit": "hours", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "150.00 $", + "Test Split": "No", + "Tasks": "speech recognition", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/ctab__corpus_of_tunisian_arabizi.json b/datasets/ctab__corpus_of_tunisian_arabizi.json new file mode 100644 index 0000000..61d3a7a --- /dev/null +++ b/datasets/ctab__corpus_of_tunisian_arabizi.json @@ -0,0 +1,36 @@ +{ + "Name": "CTAB: Corpus of Tunisian Arabizi", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/CTAB", + "Link": "https://zenodo.org/record/4781769#.YqSPY3ZBxD9", + "License": "CC BY 4.0", + "Year": 2021, + "Language": "ar", + "Dialect": "ar-TN: (Arabic (Tunisia))", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling", + "Description": "This dataset has been created between 2017 and 2021 to provide a textual resource that can be used to study the behaviors of Tunisian people in writing Tunisian Arabic (ISO 693-3: aeb) in Latin Script. This corpus is constituted from messages written using Tunisian Arabic Chat Alphabet or Arabizi and is developed to solve the matter of the lack of NLP databases about the use of the Latin Script for transcribing Tunisian Arabic.
", + "Volume": "5,702", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "University of Sfax", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "zenodo", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "dialect identification", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/culturax.json b/datasets/culturax.json new file mode 100644 index 0000000..b4b3e68 --- /dev/null +++ b/datasets/culturax.json @@ -0,0 +1,36 @@ +{ + "Name": "CulturaX", + "Subsets": [], + "HF Link": "https://hf.co/datasets/uonlp/CulturaX", + "Link": "https://hf.co/datasets/uonlp/CulturaX", + "License": "custom", + "Year": 2023, + "Language": "multilingual", + "Dialect": "mixed", + "Domain": "web pages", + "Form": "text", + "Collection Style": "crawling", + "Description": "We present CulturaX, a substantial multilingual dataset with 6.3 trillion tokens in 167 languages, tailored for large language model (LLM) development. Our dataset undergoes meticulous cleaning and deduplication through a rigorous pipeline of multiple stages to accomplish the best quality for model training, including language identification, URL-based filtering, metric-based cleaning, document refinement, and data deduplication. We employ MinHash at document level to achieve fuzzy deduplication for the datasets in different languages. Our data cleaning framework includes diverse criteria and threshold selections, guided by extensive data samples, ensuring comprehensive noise filtering in various aspects. CulturaX is fully released to the public in HuggingFace to facilitate research and advancements in multilingual LLMs.", + "Volume": "74,027,952", + "Unit": "documents", + "Ethical Risks": "High", + "Provider": "nan", + "Derived From": "mC4, OSCAR", + "Paper Title": "CulturaX: A Cleaned, Enormous, and Multilingual Dataset for Large Language Models in 167 Languages", + "Paper Link": "https://hf.co/datasets/uonlp/CulturaX", + "Script": "Arab", + "Tokenized": "No", + "Host": "HuggingFace", + "Access": "Upon-Request", + "Cost": "nan", + "Test Split": "No", + "Tasks": "text generation", + "Venue Title": "arXiv", + "Citations": "nan", + "Venue Type": "preprint", + "Venue Name": "nan", + "Authors": "Thuat Nguyen, Chien Van Nguyen, Viet Dac Lai, Hieu Man, Nghia Trung Ngo, Franck Dernoncourt, Ryan A. Rossi, Thien Huu Nguyen", + "Affiliations": "nan", + "Abstract": "The driving factors behind the development of large language models (LLMs) with impressive learning capabilities are their colossal model sizes and extensive training datasets. Along with the progress in natural language processing, LLMs have been frequently made accessible to the public to foster deeper investigation and applications. However, when it comes to training datasets for these LLMs, especially the recent state-of-the-art models, they are often not fully disclosed. Creating training data for high-performing LLMs involves extensive cleaning and deduplication to ensure the necessary level of quality. The lack of transparency for training data has thus hampered research on attributing and addressing hallucination and bias issues in LLMs, hindering replication efforts and further advancements in the community. 
These challenges become even more pronounced in multilingual learning scenarios, where the available multilingual text datasets are often inadequately collected and cleaned. Consequently, there is a lack of open-source and readily usable dataset to effectively train LLMs in multiple languages. To overcome this issue, we present CulturaX, a substantial multilingual dataset with 6.3 trillion tokens in 167 languages, tailored for LLM development. Our dataset undergoes meticulous cleaning and deduplication through a rigorous pipeline of multiple stages to accomplish the best quality for model training, including language identification, URL-based filtering, metric-based cleaning, document refinement, and data deduplication. CulturaX is fully released to the public in HuggingFace to facilitate research and advancements in multilingual LLMs: this https URL.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/curras.json b/datasets/curras.json new file mode 100644 index 0000000..7804b07 --- /dev/null +++ b/datasets/curras.json @@ -0,0 +1,36 @@ +{ + "Name": "Curras", + "Subsets": [], + "HF Link": "nan", + "Link": "https://portal.sina.birzeit.edu/curras/download.html", + "License": "custom", + "Year": 2016, + "Language": "ar", + "Dialect": "ar-PS: (Arabic (Palestine))", + "Domain": "other", + "Form": "text", + "Collection Style": "manual curation", + "Description": "Curras: a dataset for Palestinian Arabic with rich metadata including POS tagging, lemma, stem, and other features", + "Volume": "56,700", + "Unit": "tokens", + "Ethical Risks": "Medium", + "Provider": "SinaLab, Birzeit University", + "Derived From": "nan", + "Paper Title": "Curras: an annotated corpus for the Palestinian Arabic dialect", + "Paper Link": "http://www.jarrar.info/publications/JHRAZ17.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Upon-Request", + "Cost": "nan", + "Test Split": "No", + "Tasks": "named entity recognition, stemming, lemmatization", + "Venue Title": "LREC", + "Citations": "76.0", + "Venue Type": "conference", + "Venue Name": "Language Resource & Evaluation", + "Authors": "Mustafa Jarrar, Nizar Habash, Faeq Alrimawi, Diyam Akra, Nasser Zalmout", + "Affiliations": "Birzeit University, NYU", + "Abstract": "In this article we present Curras, the first morphologically annotated\ncorpus of the Palestinian Arabic dialect. Palestinian Arabic is one of the many\nprimarily spoken dialects of the Arabic language. Arabic dialects are generally\nunder-resourced compared to Modern Standard Arabic, the primarily written and\nofficial form of Arabic. We start in the article with a background description that\nsituates Palestinian Arabic linguistically and historically and compares it to Modern\nStandard Arabic and Egyptian Arabic in terms of phonological, morphological,\northographic, and lexical variations. We then describe the methodology we developed to collect Palestinian Arabic text to guarantee a variety of representative\ndomains and genres. We also discuss the annotation process we used, which\nextended previous efforts for annotation guideline development, and utilized\nexisting automatic annotation solutions for Standard Arabic and Egyptian Arabic.\nThe annotation guidelines and annotation meta-data are described in detail. The\nCurras Palestinian Arabic corpus consists of more than 56 K tokens, which are annotated with rich morphological and lexical features.
The inter-annotator agreement results indicate a high degree of consistency.", + "Added By": "Maged S. Alshaibani" +} \ No newline at end of file diff --git a/datasets/daict.json b/datasets/daict.json new file mode 100644 index 0000000..b229848 --- /dev/null +++ b/datasets/daict.json @@ -0,0 +1,36 @@ +{ + "Name": "DAICT ", + "Subsets": [], + "HF Link": "nan", + "Link": "https://www.hbku.edu.qa/en/DAICT", + "License": "custom", + "Year": 2020, + "Language": "ar", + "Dialect": "mixed", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "The dataset includes 5,588 tweets -- written in both MSA and dialectual Arabic -- manually annotated by two professional linguistics from HBKU", + "Volume": "5,588", + "Unit": "sentences", + "Ethical Risks": "Medium", + "Provider": "Hamad Bin Khalifa University", + "Derived From": "nan", + "Paper Title": "DAICT: A Dialectal Arabic Irony Corpus Extracted from Twitter\r", + "Paper Link": "http://www.lrec-conf.org/proceedings/lrec2020/pdf/2020.lrec-1.768.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "irony detection", + "Venue Title": "LREC", + "Citations": "15.0", + "Venue Type": "conference", + "Venue Name": "International Conference on Language Resources and Evaluation", + "Authors": "Ines Abbes,W. Zaghouani,Omaima El-Hardlo,Faten Ashour", + "Affiliations": ",,,", + "Abstract": "Identifying irony in user-generated social media content has a wide range of applications; however to date Arabic content has received limited attention. To bridge this gap, this study builds a new open domain Arabic corpus annotated for irony detection. We query Twitter using irony-related hashtags to collect ironic messages, which are then manually annotated by two linguists according to our working definition of irony. Challenges which we have encountered during the annotation process reflect the inherent limitations of Twitter messages interpretation, as well as the complexity of Arabic and its dialects. Once published, our corpus will be a valuable free resource for developing open domain systems for automatic irony recognition in Arabic language and its dialects in social media text.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/dares.json b/datasets/dares.json new file mode 100644 index 0000000..a157dbb --- /dev/null +++ b/datasets/dares.json @@ -0,0 +1,36 @@ +{ + "Name": "DARES", + "Subsets": [], + "HF Link": "nan", + "Link": "https://github.com/DamithDR/arabic-readability-assessment", + "License": "CC BY-NC 4.0", + "Year": 2024, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "books", + "Form": "text", + "Collection Style": "other", + "Description": "DARES is a dataset for Arabic readability estimation based on Saudi school textbooks. It contains 13,335 instances of text, focusing on the readability levels of different educational levels (grades 1-12). 
It is organized into two main subtasks: (a) Coarse-grained readability, classifying texts into broad educational levels (elementary, intermediate, high school), and (b) Fine-grained readability, classifying texts into individual grade levels.", + "Volume": "13,335", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "Multiple Institutions", + "Derived From": "Saudi school textbooks (grades 1-12)", + "Paper Title": "DARES: Dataset for Arabic Readability Estimation of School Materials", + "Paper Link": "https://aclanthology.org/2024.determit-1.10.pdf", + "Script": "Arab", + "Tokenized": "Yes", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "readability assessment", + "Venue Title": "LREC-COLING", + "Citations": "nan", + "Venue Type": "workshop", + "Venue Name": "Workshop on DeTermIt! Evaluating Text Difficulty in a Multilingual Context @ LREC-COLING 2024", + "Authors": "Mo El-Haj, Sultan Almujaiwel, Damith Premasiri, Tharindu Ranasinghe, Ruslan Mitkov", + "Affiliations": "Lancaster University, King Saud University, Aston University", + "Abstract": "The DARES dataset is introduced as a resource for assessing Arabic text readability in the context of Saudi educational materials. It focuses on readability levels across different educational stages (grades 1-12) and consists of coarse-grained and fine-grained tasks. The dataset was used to fine-tune transformer models for readability estimation, demonstrating high performance in classifying educational materials by their readability level.", + "Added By": "Maryam Al Emadi" +} \ No newline at end of file diff --git a/datasets/dart.json b/datasets/dart.json new file mode 100644 index 0000000..364104a --- /dev/null +++ b/datasets/dart.json @@ -0,0 +1,67 @@ +{ + "Name": "DART ", + "Subsets": [ + { + "Name": "EGY", + "Dialect": "ar-EG: (Arabic (Egypt))", + "Volume": "5,265", + "Unit": "sentences" + }, + { + "Name": "GLF", + "Dialect": "ar-GLF: (Arabic (Gulf))", + "Volume": "5,893", + "Unit": "sentences" + }, + { + "Name": "IRQ", + "Dialect": "ar-IQ: (Arabic (Iraq))", + "Volume": "5,253", + "Unit": "sentences" + }, + { + "Name": "LEV", + "Dialect": "ar-LEV: (Arabic (Levant))", + "Volume": "3,939", + "Unit": "sentences" + }, + { + "Name": "MGH", + "Dialect": "ar-NOR: (Arabic (North Africa))", + "Volume": "3,930", + "Unit": "sentences" + } + ], + "HF Link": "https://hf.co/datasets/arbml/DART", + "Link": "https://www.dropbox.com/s/jslg6fzxeu47flu/DART.zip?dl=0", + "License": "unknown", + "Year": 2018, + "Language": "ar", + "Dialect": "mixed", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "Dialectal Arabic Tweets", + "Volume": "24,280", + "Unit": "sentences", + "Ethical Risks": "Medium", + "Provider": "Qatar University", + "Derived From": "nan", + "Paper Title": "DART: A Large Dataset of Dialectal Arabic Tweets\r", + "Paper Link": "https://aclanthology.org/L18-1579.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "Dropbox", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "dialect identification", + "Venue Title": "LREC", + "Citations": "15.0", + "Venue Type": "conference", + "Venue Name": "International Conference on Language Resources and Evaluation", + "Authors": "Israa Alsarsour,Esraa Mohamed,Reem Suwaileh,T. Elsayed", + "Affiliations": ",,,", + "Abstract": "In this paper, we present a new large manually-annotated multi-dialect dataset of Arabic tweets that is publicly available. 
The Dialectal ARabic Tweets (DART) dataset has about 25K tweets that are annotated via crowdsourcing and it is well-balanced over five main groups of Arabic dialects: Egyptian, Maghrebi, Levantine, Gulf, and Iraqi. The paper outlines the pipeline of constructing the dataset from crawling tweets that match a list of dialect phrases to annotating the tweets by the crowd. We also touch some challenges that we face during the process. We evaluate the quality of the dataset from two perspectives: the inter-annotator agreement and the accuracy of the final labels. Results show that both measures were substantially high for the Egyptian, Gulf, and Levantine dialect groups, but lower for the Iraqi and Maghrebi dialects, which indicates the difficulty of identifying those two dialects manually and hence automatically.", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/database_of_arab_names.json b/datasets/database_of_arab_names.json new file mode 100644 index 0000000..85ddccd --- /dev/null +++ b/datasets/database_of_arab_names.json @@ -0,0 +1,36 @@ +{ + "Name": "Database of Arab Names", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalogue.elra.info/en-us/repository/browse/ELRA-L0122/", + "License": "Non Commercial Use - ELRA END USER", + "Year": 2019, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "Very comprehensive database of Arabic personal names and name variants mapped to the original Arabic script with a large variety of supplementary information. The database consists of 6,500,000 terms.", + "Volume": "6,500,000", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": " ", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "ELRA", + "Access": "With-Fee", + "Cost": "11,250.00\u20ac", + "Test Split": "No", + "Tasks": "part of speech tagging", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/database_of_arabic_plurals.json b/datasets/database_of_arabic_plurals.json new file mode 100644 index 0000000..5bccd49 --- /dev/null +++ b/datasets/database_of_arabic_plurals.json @@ -0,0 +1,36 @@ +{ + "Name": "Database of Arabic Plurals", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalogue.elra.info/en-us/repository/browse/ELRA-L0121/", + "License": "Non Commercial Use - ELRA END USER", + "Year": 2019, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "This database covers both regular and irregular Arabic plurals, and was developed by experts over a period of several years.
The data includes various grammatical attributes such as part-of-speech, collectivity codes, gender codes, and full vocalization.", + "Volume": "nan", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": " ", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "ELRA", + "Access": "With-Fee", + "Cost": "1,875.00\u20ac", + "Test Split": "No", + "Tasks": "grammatical analysis, gender identification, speech recognition, part of speech tagging", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/database_of_foreign_names_in_arabic.json b/datasets/database_of_foreign_names_in_arabic.json new file mode 100644 index 0000000..0c531e3 --- /dev/null +++ b/datasets/database_of_foreign_names_in_arabic.json @@ -0,0 +1,36 @@ +{ + "Name": "Database of Foreign Names in Arabic", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalogue.elra.info/en-us/repository/browse/ELRA-L0124/", + "License": "Non Commercial Use - ELRA END USER", + "Year": 2019, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "manual curation", + "Description": "This database covers non-Arabic names, their Arabic equivalents, and Arabic script variants for each name (with the most important variant given first).", + "Volume": "nan", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": " ", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "ELRA", + "Access": "With-Fee", + "Cost": "3,750.00\u20ac", + "Test Split": "No", + "Tasks": "machine translation", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/dataset_for_arabic_classification.json b/datasets/dataset_for_arabic_classification.json new file mode 100644 index 0000000..7af6f35 --- /dev/null +++ b/datasets/dataset_for_arabic_classification.json @@ -0,0 +1,36 @@ +{ + "Name": "DataSet for Arabic Classification", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/DataSet_Arabic_Classification", + "Link": "https://data.mendeley.com/datasets/v524p5dhpj/2", + "License": "CC BY 4.0", + "Year": 2018, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "crawling", + "Description": "DataSet for Arabic text classification. The dataset, as mentioned by the author's description has been collected semi-automatically.", + "Volume": "111,700", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "Universite Sultan Moulay Slimane de Beni-Mellal, Universite Chouaib Doukkali Faculte des Sciences", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "Mendeley Data", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "topic classification", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "mohamed BINIZ", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Maged S. 
Alshaibani" +} \ No newline at end of file diff --git a/datasets/dataset_for_evaluating_root_extraction.json b/datasets/dataset_for_evaluating_root_extraction.json new file mode 100644 index 0000000..25a4729 --- /dev/null +++ b/datasets/dataset_for_evaluating_root_extraction.json @@ -0,0 +1,36 @@ +{ + "Name": "Dataset for evaluating root extraction", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/root_extraction_vaidation", + "Link": "https://github.com/arabic-digital-humanities/root-extraction-validation-data", + "License": "CC BY 4.0", + "Year": 2018, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "This dataset contains data to evaluate the roots extracted by Arabic stemmers and morphological analyzers.", + "Volume": "2,962", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "nan", + "Derived From": "OpenITI project", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "lemmatization ", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/dawqas__a_dataset_for_arabic_why_question_answering_system.json b/datasets/dawqas__a_dataset_for_arabic_why_question_answering_system.json new file mode 100644 index 0000000..641c1e5 --- /dev/null +++ b/datasets/dawqas__a_dataset_for_arabic_why_question_answering_system.json @@ -0,0 +1,36 @@ +{ + "Name": "DAWQAS: A Dataset for Arabic Why Question Answering System", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/DAWQAS", + "Link": "https://github.com/masun/DAWQAS", + "License": "unknown", + "Year": 2018, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "A Dataset for Arabic Why Question Answering System", + "Volume": "3,205", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "Multiple Institutions", + "Derived From": "nan", + "Paper Title": "DAWQAS: A Dataset for Arabic Why Question Answering System", + "Paper Link": "https://www.sciencedirect.com/science/article/pii/S1877050918321690", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "question answering", + "Venue Title": "ACLING", + "Citations": "6.0", + "Venue Type": "conference", + "Venue Name": "nternational Conference on AI in Computational Linguistics", + "Authors": "W. S. Ismail,Masun Nabhan Homsi", + "Affiliations": ",", + "Abstract": "Abstract A why question answering system is a tool designed to answer why-questions posed in natural language. Several papers have been published on the problem of answering why-questions. In particular, attempts have been made to analyze Arabic text and predict which passages are best candidates for the why-questions; employing different datasets with limited size and not publicly available. To overcome these limitations, this paper introduces the new publicly available dataset, DAWQAS: Dataset for Arabic Why Question Answering System. 
It consists of 3205 of why question-answer pairs that were first scraped from public Arabic websites, then texts were preprocessed and converted to feature vectors. Afterwards, why-answers were re-categorized based on their domains. Finally, the rhetorical relations\u2019 probabilities based on discourse markers were computed for each sentence in the dataset. DAWQAS is a valuable resource for research and evaluation in language understanding. The new dataset is freely available at https://github.com/masun/DAWQAS .", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/defarabicqa.json b/datasets/defarabicqa.json new file mode 100644 index 0000000..13d3605 --- /dev/null +++ b/datasets/defarabicqa.json @@ -0,0 +1,36 @@ +{ + "Name": "DefArabicQA", + "Subsets": [], + "HF Link": "nan", + "Link": "https://sites.google.com/site/anlprg/outils-et-corpus-realises", + "License": "unknown", + "Year": 2010, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "wikipedia", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "2000 snippets returned by Google search engine and Wikipedia Arabic version\nand a set of 50 organization definition questions", + "Volume": "2,000", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "Multiple Institutions", + "Derived From": "nan", + "Paper Title": "DefArabicQA: Arabic Definition Question Answering System \r", + "Paper Link": "http://personales.upv.es/prosso/resources/TriguiEtAl_LREC10.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "question answering", + "Venue Title": "LREC", + "Citations": "51.0", + "Venue Type": "conference", + "Venue Name": "International Conference on Language Resources and Evaluation", + "Authors": "Omar Trigui,L. Belguith,P. Rosso", + "Affiliations": ",,", + "Abstract": "Today the Web is the largest resource of knowledge and, therefore, sometimes this makes it difficult to find precise information. Current search engines can only return ranked snippets containing the effective answers to a query user. But, they can not return the exact answers. Question Answering systems present the solution to obtain effective and exact answers to a user question asked in natural language question instead of keywords query. Unfortunately, Question Answering task for the Arabic language has not been investigated enough in the last decade, compared to other languages. In this paper, we tackle the definition Question Answering task for the Arabic language. We propose an Arabic definitional Question Answering system based on a pattern approach to identify exact and accurate definitions about organization using Web resources. We experimented this system using 2000 snippets returned by Google search engine and Wikipedia Arabic version and a set of 50 organization definition questions. The obtained results are very encouraging: (90%) of the questions used have complete (vital) definitions in the top-five answers and (64%) of them have complete definitions in the top-one answer. 
MRR was (0.81).", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/detect_egyptian_wikipedia_template-translated_articles.json b/datasets/detect_egyptian_wikipedia_template-translated_articles.json new file mode 100644 index 0000000..f525bc9 --- /dev/null +++ b/datasets/detect_egyptian_wikipedia_template-translated_articles.json @@ -0,0 +1,36 @@ +{ + "Name": "Detect Egyptian Wikipedia Template-translated Articles ", + "Subsets": [], + "HF Link": "https://hf.co/datasets/SaiedAlshahrani/Detect-Egyptian-Wikipedia-Articles", + "Link": "https://github.com/SaiedAlshahrani/leveraging-corpus-metadata", + "License": "MIT License", + "Year": 2024, + "Language": "ar", + "Dialect": "ar-EG: (Arabic (Egypt))", + "Domain": "wikipedia", + "Form": "text", + "Collection Style": "manual curation", + "Description": "A labeled dataset of Egyptian Arabic Wikipedia articles extracted from Wikipedia dumps 2024-01-01, along with their metadata.", + "Volume": "755,665", + "Unit": "documents", + "Ethical Risks": "Medium", + "Provider": "Clarkson University", + "Derived From": "Egyptian Arabic Wikipedia", + "Paper Title": "Leveraging Corpus Metadata to Detect Template-based Translation: An Exploratory Case Study of the Egyptian Arabic Wikipedia Edition", + "Paper Link": "https://arxiv.org/pdf/2404.00565.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "HuggingFace", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "language modelling, topic classification, language identification, text classification", + "Venue Title": "OSACT6", + "Citations": "nan", + "Venue Type": "workshop", + "Venue Name": "The 6th Workshop on Open-Source Arabic Corpora and Processing Tools", + "Authors": "Saied Alshahrani, Hesham Haroon, Ali Elfilali, Mariama Njie, Jeanna Matthews", + "Affiliations": "Clarkson University, Sesame Labs, Cadi Ayyad University, M&T Bank", + "Abstract": "Wikipedia articles (content pages) are commonly used corpora in Natural Language Processing (NLP) research, especially in low-resource languages other than English. Yet, a few research studies have studied the three Arabic Wikipedia editions, Arabic Wikipedia (AR), Egyptian Arabic Wikipedia (ARZ), and Moroccan Arabic Wikipedia (ARY), and documented issues in the Egyptian Arabic Wikipedia edition regarding the massive automatic creation of its articles using template-based translation from English to Arabic without human involvement, overwhelming the Egyptian Arabic Wikipedia with articles that do not only have low-quality content but also with articles that do not represent the Egyptian people, their culture, and their dialect. In this paper, we aim to mitigate the problem of template translation that occurred in the Egyptian Arabic Wikipedia by identifying these template-translated articles and their characteristics through exploratory analysis and building automatic detection systems. We first explore the content of the three Arabic Wikipedia editions in terms of density, quality, and human contributions and utilize the resulting insights to build multivariate machine learning classifiers leveraging articles' metadata to detect the template-translated articles automatically. We then publicly deploy and host the best-performing classifier, XGBoost, as an online application called EGYPTIAN WIKIPEDIA SCANNER and release the extracted, filtered, and labeled datasets to the research community to benefit from our datasets and the online, web-based detection system. 
", + "Added By": "Saied Alshahrani" +} \ No newline at end of file diff --git a/datasets/dialectal_arabic_code-switching_dataset.json b/datasets/dialectal_arabic_code-switching_dataset.json new file mode 100644 index 0000000..3257af4 --- /dev/null +++ b/datasets/dialectal_arabic_code-switching_dataset.json @@ -0,0 +1,36 @@ +{ + "Name": "Dialectal Arabic Code-Switching Dataset", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/Dialectal_Speech_Code_Switching", + "Link": "https://github.com/qcri/Arabic_speech_code_switching", + "License": "MIT License", + "Year": 2020, + "Language": "ar", + "Dialect": "ar-EG: (Arabic (Egypt))", + "Domain": "transcribed audio", + "Form": "text", + "Collection Style": "human translation", + "Description": "The dataset studies code-switching between Egyptian and modern standard Arabic in broadcast domain.", + "Volume": "2", + "Unit": "hours", + "Ethical Risks": "Medium", + "Provider": "QCRI", + "Derived From": "ADI-5", + "Paper Title": "Effects of Dialectal Code-Switching on Speech Modules: A Study using Egyptian Arabic Broadcast Speech", + "Paper Link": "http://www.interspeech2020.org/uploadfile/pdf/Wed-1-10-5.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "word-level code switching, code switching", + "Venue Title": "nan", + "Citations": "5.0", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "Chowdhury, Shammur Absar and Samih, Younes and Eldesouki, Mohamed and Ali, Ahmed", + "Affiliations": "nan", + "Abstract": "The intra-utterance code-switching (CS) is defined as the alternation between two or more languages within the same utterance. Despite the fact that spoken dialectal code-switching (DCS) is more challenging than CS, it remains largely unexplored. In this study, we describe a method to build the first spoken DCS corpus. The corpus is annotated at the token-level minding both linguistic and acoustic cues for dialectal Arabic. For detailed analysis, we study Arabic automatic speech recognition (ASR), Arabic dialect identification (ADI), and natural language processing (NLP) modules for the DCS corpus. Our results highlight the importance of lexical information for discriminating the DCS labels. 
We observe that the performance of different models is highly dependent on the degree of code-mixing at the token-level as well as its complexity at the utterance-level.", + "Added By": "Nouamane Tazi" +} \ No newline at end of file diff --git a/datasets/dialex.json b/datasets/dialex.json new file mode 100644 index 0000000..172ec81 --- /dev/null +++ b/datasets/dialex.json @@ -0,0 +1,67 @@ +{ + "Name": "DiaLex", + "Subsets": [ + { + "Name": "Algerian", + "Dialect": "ar-DZ: (Arabic (Algeria))", + "Volume": "607", + "Unit": "sentences" + }, + { + "Name": "Egyptian", + "Dialect": "ar-EG: (Arabic (Egypt))", + "Volume": "588", + "Unit": "sentences" + }, + { + "Name": "Lebanese", + "Dialect": "ar-LB: (Arabic (Lebanon))", + "Volume": "633", + "Unit": "sentences" + }, + { + "Name": "Syrian", + "Dialect": "ar-SY: (Arabic (Syria))", + "Volume": "593", + "Unit": "sentences" + }, + { + "Name": "Tunisian", + "Dialect": "ar-TN: (Arabic (Tunisia))", + "Volume": "649", + "Unit": "sentences" + } + ], + "HF Link": "https://hf.co/datasets/arbml/dialex", + "Link": "https://github.com/UBC-NLP/dialex", + "License": "unknown", + "Year": 2021, + "Language": "ar", + "Dialect": "mixed", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "A Benchmark for Evaluating Multidialectal Arabic Word Embeddings", + "Volume": "3,070", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "Multiple Institutions ", + "Derived From": "nan", + "Paper Title": "DiaLex: A Benchmark for Evaluating Multidialectal Arabic Word Embeddings", + "Paper Link": "https://aclanthology.org/2021.wanlp-1.2", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "benchmarking multidialectal word embeddings", + "Venue Title": "WANLP", + "Citations": "nan", + "Venue Type": "workshop", + "Venue Name": "Arabic Natural Language Processing Workshop", + "Authors": "Muhammad Abdul-Mageed, Shady Elbassuoni, Jad Doughman, AbdelRahim Elmadany, El Moatez Billah Nagoudi, Yorgo Zoughby, Ahmad Shaher, Iskander Gaba, Ahmed Helal, and Mohammed El-Razzaz.", + "Affiliations": "nan", + "Abstract": "Word embeddings are a core component of modern natural language processing systems, making the ability to thoroughly evaluate them a vital task. We describe DiaLex, a benchmark for intrinsic evaluation of dialectal Arabic word embeddings. DiaLex covers five important Arabic dialects: Algerian, Egyptian, Lebanese, Syrian, and Tunisian. Across these dialects, DiaLex provides a testbank for six syntactic and semantic relations, namely male to female, singular to dual, singular to plural, antonym, comparative, and genitive to past tense. DiaLex thus consists of a collection of word pairs representing each of the six relations in each of the five dialects. To demonstrate the utility of DiaLex, we use it to evaluate a set of existing and new Arabic word embeddings that we developed. Beyond evaluation of word embeddings, DiaLex supports efforts to integrate dialects into the Arabic language curriculum. It can be easily translated into Modern Standard Arabic and English, which can be useful for evaluating word translation. 
Our benchmark, evaluation code, and new word embedding models will be publicly available.", + "Added By": "Iskander Gaba" +} \ No newline at end of file diff --git a/datasets/disease_ner.json b/datasets/disease_ner.json new file mode 100644 index 0000000..779e323 --- /dev/null +++ b/datasets/disease_ner.json @@ -0,0 +1,36 @@ +{ + "Name": "Disease NER", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/Disease_NER", + "Link": "https://www.mdpi.com/2306-5729/5/3/60/htm#app1-data-05-00060", + "License": "unknown", + "Year": 2020, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "The data consist of 27 Arabic medical articles, totaling around 50,000 words.", + "Volume": "62,506", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "Jouf University", + "Derived From": "BRAD 1.0", + "Paper Title": "An Arabic Dataset for Disease Named Entity Recognition with Multi-Annotation Schemes", + "Paper Link": "https://www.mdpi.com/2306-5729/5/3/60/htm", + "Script": "Arab", + "Tokenized": "No", + "Host": "MDPI", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "named entity recognition", + "Venue Title": "Data", + "Citations": "2.0", + "Venue Type": "journal", + "Venue Name": "nan", + "Authors": "Nasser O. Alshammari,S. Alanazi", + "Affiliations": ",", + "Abstract": "This article outlines a novel data descriptor that provides the Arabic natural language processing community with a dataset dedicated to named entity recognition tasks for diseases. The dataset comprises more than 60 thousand words, which were annotated manually by two independent annotators using the inside\u2013outside (IO) annotation scheme. To ensure the reliability of the annotation process, the inter-annotator agreements rate was calculated, and it scored 95.14%. Due to the lack of research efforts in the literature dedicated to studying Arabic multi-annotation schemes, a distinguishing and a novel aspect of this dataset is the inclusion of six more annotation schemes that will bridge the gap by allowing researchers to explore and compare the effects of these schemes on the performance of the Arabic named entity recognizers. These annotation schemes are IOE, IOB, BIES, IOBES, IE, and BI. Additionally, five linguistic features, including part-of-speech tags, stopwords, gazetteers, lexical markers, and the presence of the definite article, are provided for each record in the dataset.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/dixaf__bilingual_dictionary_french_arabic,_arabic_french.json b/datasets/dixaf__bilingual_dictionary_french_arabic,_arabic_french.json new file mode 100644 index 0000000..4efa092 --- /dev/null +++ b/datasets/dixaf__bilingual_dictionary_french_arabic,_arabic_french.json @@ -0,0 +1,36 @@ +{ + "Name": "DixAF: Bilingual Dictionary French Arabic, Arabic French", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalogue.elra.info/en-us/repository/browse/ELRA-M0040/", + "License": "Non Commercial Use - ELRA END USER", + "Year": 2004, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "DixAF (Dictionnaire bilingue fran\u00e7ais arabe, arabe fran\u00e7ais - Bilingual Dictionary French Arabic, Arabic French) is a joint ownership of CNRS/ENS lettres et sciences humaines. 
It was developed by Mr Fathi Debili, a CNRS officer, and it consists of around 125,000 binary links between ca. 43,800 French entries and ca. 35,000 Arabic entries. ", + "Volume": "35,000", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": " ", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "ELRA", + "Access": "With-Fee", + "Cost": "18,000.00\u20ac", + "Test Split": "No", + "Tasks": "machine translation", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/doda__darija_open_dataset.json b/datasets/doda__darija_open_dataset.json new file mode 100644 index 0000000..e50b1a8 --- /dev/null +++ b/datasets/doda__darija_open_dataset.json @@ -0,0 +1,36 @@ +{ + "Name": "DODA: Darija Open Dataset", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/darija", + "Link": "https://github.com/darija-open-dataset/dataset", + "License": "MIT License", + "Year": 2021, + "Language": "ar", + "Dialect": "ar-MA: (Arabic (Morocco))", + "Domain": "other", + "Form": "text", + "Collection Style": "manual curation", + "Description": "more than 10,000 words", + "Volume": "10,000", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "Unknown", + "Derived From": "nan", + "Paper Title": "Moroccan Dialect -Darija- Open Dataset", + "Paper Link": "https://arxiv.org/ftp/arxiv/papers/2103/2103.09687.pdf", + "Script": "Arab-Latn", + "Tokenized": "Yes", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "transliteration, machine translation", + "Venue Title": "ArXiv", + "Citations": "0.0", + "Venue Type": "preprint", + "Venue Name": "ArXiv", + "Authors": "Aissam Outchakoucht,Hamza Es-Samaali", + "Affiliations": ",", + "Abstract": "Nowadays, we are witnessing an unprecedented growth of IT products and services. Yet, in order for many of these solutions to flourish and be viable in a given society, they need to \u00ab understand \u00bb and be able to communicate to some extent using native languages. 
However, it turns out that step 0 in any serious engagement with Natural Language Processing (NLP) consists of translating the vocabulary to the widely used and most documented language in this field, namely English.", + "Added By": "Mustafa Ghaleb" +} \ No newline at end of file diff --git a/datasets/dzdc12.json b/datasets/dzdc12.json new file mode 100644 index 0000000..0b1c869 --- /dev/null +++ b/datasets/dzdc12.json @@ -0,0 +1,36 @@ +{ + "Name": "DZDC12", + "Subsets": [], + "HF Link": "nan", + "Link": "https://github.com/xprogramer/DZDC12", + "License": "unknown", + "Year": 2020, + "Language": "ar", + "Dialect": "ar-DZ: (Arabic (Algeria))", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling", + "Description": "DZDC12 is a multi-purpose parallel corpus crawled from Facebook", + "Volume": "2,400", + "Unit": "sentences", + "Ethical Risks": "High", + "Provider": "Department of Electronics and Telecommunications, Universit\u00e9 8 Mai 1945 Guelma,24000,Guelma,Algeria", + "Derived From": "nan", + "Paper Title": "DZDC12: a new multipurpose parallel Algerian Arabizi\u2013French code-switched corpus", + "Paper Link": "https://link.springer.com/article/10.1007/s10579-019-09454-8", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "machine translation, language modeling, dialect identification, information retrieval, offensive language detection, gender identification, natural language inference, user behavior", + "Venue Title": "LRE", + "Citations": "2.0", + "Venue Type": "journal", + "Venue Name": "Language Resources and Evaluation", + "Authors": "Kheireddine Abainia", + "Affiliations": "PIMIS Laboratory, Department of Electronics and Telecommunications, Universit\u00e9 8 Mai 1945 Guelma,24000,Guelma,Algeria", + "Abstract": "Algeria\u2019s socio-linguistic situation is known as a complex phenomenon involving several historical, cultural and technological factors. However, there are three languages that are mainly spoken in Algeria (Arabic, Tamazight and French) and they can be mixed in the same sentence (code-switching). Moreover, there are several varieties of dialects that differ from one region to another and sometimes within the same region. This paper aims to provide a new multi-purpose parallel corpus (i.e., DZDC12 corpus), which will serve as a testbed for various natural language processing and information retrieval applications. In particular, it can be a useful tool to study Arabic\u2013French code-switching phenomenon, Algerian Romanized Arabic (Arabizi), different Algerian sub-dialects, sentiment analysis, gender writing style, machine translation, abuse detection, etc. To the best of our knowledge, the proposed corpus is the first of its kind, where the texts are written in Latin script and crawled from Facebook. More specifically, this corpus is organised by gender, region and city, and is transliterated into Arabic script and translated into Modern Standard Arabic. In addition, it is annotated for emotion detection and abuse detection, and annotated at the word level. This article focuses in particular on Algeria\u2019s socio-linguistic situation and the effect of social media networks.
Furthermore, the general guidelines for the design of DZDC12 corpus are described as well as the dialects clustering over the map.", + "Added By": "Jezia Zakraoui" +} \ No newline at end of file diff --git a/datasets/dziribert.json b/datasets/dziribert.json new file mode 100644 index 0000000..f48e578 --- /dev/null +++ b/datasets/dziribert.json @@ -0,0 +1,36 @@ +{ + "Name": "DziriBERT", + "Subsets": [], + "HF Link": "https://hf.co/alger-ia/dziribert", + "Link": "https://github.com/alger-ia/dziribert", + "License": "Apache-2.0", + "Year": 2022, + "Language": "ar", + "Dialect": "ar-DZ: (Arabic (Algeria))", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "The DziriBERT dataset contains over 1.1 million tweets written in the Algerian dialect, collected from Twitter. It includes tweets written in both Arabic script and Romanized script (Arabizi). The dataset is designed to develop language models specifically for the Algerian dialect, which differs from Modern Standard Arabic (MSA).", + "Volume": "1,100,000", + "Unit": "sentences", + "Ethical Risks": "Medium", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "DziriBERT: a Pre-trained Language Model for the Algerian Dialect", + "Paper Link": "https://arxiv.org/pdf/2109.12346 ", + "Script": "Arab-Latn", + "Tokenized": "No", + "Host": "HuggingFace", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "sentiment analysis, language modeling, topic classification, emotion detection", + "Venue Title": "arXiv", + "Citations": "nan", + "Venue Type": "preprint", + "Venue Name": "nan", + "Authors": "Amine Abdaoui, Mohamed Berrimi, Mourad Oussalah, Abdelouahab Moussaoui", + "Affiliations": "Oracle, University of Ferhat Abbas 1, University of Oulu", + "Abstract": "Pre-trained transformers are now the de facto models in Natural Language Processing given their state-of-the-art results in many tasks and languages. However, most of the current models have been trained on languages for which large text resources are already available (such as English, French, Arabic, etc.). Therefore, there are still a number of low-resource languages that need more attention from the community. In this paper, we study the Algerian dialect, which has several specificities that make the use of Arabic or multilingual models inappropriate. To address this issue, we collected more than one million Algerian tweets, and pre-trained the first Algerian language model: DziriBERT. When compared with existing models, DziriBERT achieves better results, especially when dealing with the Roman script. The obtained results show that pre-training a dedicated model on a small dataset (150 MB) can outperform existing models that have been trained on much more data (hundreds of GB). 
Finally, our model is publicly available to the community.", + "Added By": "Maryam Al Emadi" +} \ No newline at end of file diff --git a/datasets/easc.json b/datasets/easc.json new file mode 100644 index 0000000..8197877 --- /dev/null +++ b/datasets/easc.json @@ -0,0 +1,36 @@ +{ + "Name": "EASC", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/EASC", + "Link": "https://sourceforge.net/projects/easc-corpus/", + "License": "CC BY-SA 3.0", + "Year": 2010, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "crawling", + "Description": "153 Arabic articles and 765 human-generated extractive summaries of those articles", + "Volume": "153", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "University of Essex", + "Derived From": "nan", + "Paper Title": "Using Mechanical Turk to Create a Corpus of Arabic Summaries", + "Paper Link": "http://repository.essex.ac.uk/4064/1/LREC2010_MTurk.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "sourceforge", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "summarization", + "Venue Title": "other", + "Citations": "59.0", + "Venue Type": "preprint", + "Venue Name": "nan", + "Authors": "Mahmoud El-Haj,Udo Kruschwitz,C. Fox", + "Affiliations": "Lancaster University,University of Regensburg,", + "Abstract": "This paper describes the creation of a human-generated corpus of extractive Arabic summaries of a selection of Wikipedia and Arabic newspaper articles using Mechanical Turk, an online workforce. The purpose of this exercise was two-fold. First, it addresses a shortage of relevant data for Arabic natural language processing. Second, it demonstrates the application of Mechanical Turk to the problem of creating natural language resources. The paper also reports on a number of evaluations we have performed to compare the collected summaries against results obtained from a variety of automatic summarisation systems.", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/edgad.json b/datasets/edgad.json new file mode 100644 index 0000000..b721803 --- /dev/null +++ b/datasets/edgad.json @@ -0,0 +1,36 @@ +{ + "Name": "EDGAD", + "Subsets": [], + "HF Link": "nan", + "Link": "https://github.com/shery91/Egyptian-Dialect-Gender-Annotated-Dataset", + "License": "unknown", + "Year": 2019, + "Language": "ar", + "Dialect": "ar-EG: (Arabic (Egypt))", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "Egyptian Dialect Gender Annotated Dataset (EDGAD) obtained from Twitter as well as a proposed text classification solution for the Gender Identification (GI) problem. The dataset consists of 70,000 tweets per gender", + "Volume": "140,000", + "Unit": "sentences", + "Ethical Risks": "Medium", + "Provider": "Cairo University", + "Derived From": "nan", + "Paper Title": "Gender identification of Egyptian dialect in Twitter", + "Paper Link": "https://www.sciencedirect.com/science/article/pii/S1110866518302044", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "gender identification", + "Venue Title": "EIJ", + "Citations": "6.0", + "Venue Type": "journal", + "Venue Name": "Egyptian Informatics Journal", + "Authors": "Shereen Hussein,Mona Farouk,E.
Hemayed", + "Affiliations": ",,", + "Abstract": "Abstract Despite the widespread of social media among all age groups in Arabic countries, the research directed towards Author Profiling (AP) is still in its early stages. This paper provides an Egyptian Dialect Gender Annotated Dataset (EDGAD) obtained from Twitter as well as a proposed text classification solution for the Gender Identification (GI) problem. The dataset consists of 70,000 tweets per gender. In text classification, a Mixed Feature Vector (MFV) with different stylometric and Egyptian Arabic Dialect (EAD) language-specific features is proposed, in addition to N-Gram Feature Vector (NFV). Ensemble weighted average is applied to the Random Forest (RF) with MFV and Logistic Regression (LR) with NFV. The achieved gender identification accuracy is 87.6%.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/egyptian_arabic_wikipedia_20230101.json b/datasets/egyptian_arabic_wikipedia_20230101.json new file mode 100644 index 0000000..c341ac3 --- /dev/null +++ b/datasets/egyptian_arabic_wikipedia_20230101.json @@ -0,0 +1,36 @@ +{ + "Name": "Egyptian_Arabic_Wikipedia_20230101", + "Subsets": [], + "HF Link": "https://hf.co/datasets/SaiedAlshahrani/Egyptian_Arabic_Wikipedia_20230101", + "Link": "https://hf.co/datasets/SaiedAlshahrani/Egyptian_Arabic_Wikipedia_20230101", + "License": "MIT License", + "Year": 2023, + "Language": "ar", + "Dialect": "ar-EG: (Arabic (Egypt))", + "Domain": "wikipedia", + "Form": "text", + "Collection Style": "manual curation", + "Description": "Egyptian_Arabic_Wikipedia_20230101 is a dataset created using the Egyptian Arabic Wikipedia articles, downloaded on the 1st of January 2023, and processed to train an Egyptian Arabic RoBERTa model.", + "Volume": "728,000", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "Clarkson University", + "Derived From": "Egyptian Arabic Wikipedia Dump 2023-01-01", + "Paper Title": "Performance Implications of Using Unrepresentative Corpora in Arabic Natural Language Processing", + "Paper Link": "https://aclanthology.org/2023.arabicnlp-1.19.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "HuggingFace", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "language modeling", + "Venue Title": "ArabicNLP 2023", + "Citations": "nan", + "Venue Type": "conference", + "Venue Name": "The First Arabic Natural Language Processing Conference", + "Authors": "Saied Alshahrani, Norah Alshahrani, Soumyabrata Dey, Jeanna Matthews", + "Affiliations": "Clarkson University", + "Abstract": "Wikipedia articles are a widely used source of training data for Natural Language Processing (NLP) research, particularly as corpora for low-resource languages like Arabic. However, it is essential to understand the extent to which these corpora reflect the representative contributions of native speakers, especially when many entries in a given language are directly translated from other languages or automatically generated through automated mechanisms. In this paper, we study the performance implications of using inorganic corpora that are not representative of native speakers and are generated through automated techniques such as bot generation or automated template-based translation. 
The case of the Arabic Wikipedia editions gives a unique case study of this since the Moroccan Arabic Wikipedia edition (ARY) is small but representative, the Egyptian Arabic Wikipedia edition (ARZ) is large but unrepresentative, and the Modern Standard Arabic Wikipedia edition (AR) is both large and more representative. We intrinsically evaluate the performance of two main NLP upstream tasks, namely word representation and language modeling, using word analogy evaluations and fill-mask evaluations using our two newly created datasets: Arab States Analogy Dataset (ASAD) and Masked Arab States Dataset (MASD). We demonstrate that for good NLP performance, we need both large and organic corpora; neither alone is sufficient. We show that producing large corpora through automated means can be counter-productive, producing models that both perform worse and lack cultural richness and meaningful representation of the Arabic language and its native speakers.", + "Added By": "Saied Alshahrani" +} \ No newline at end of file diff --git a/datasets/egyptian_colloquial_arabic_lexicon.json b/datasets/egyptian_colloquial_arabic_lexicon.json new file mode 100644 index 0000000..728146b --- /dev/null +++ b/datasets/egyptian_colloquial_arabic_lexicon.json @@ -0,0 +1,36 @@ +{ + "Name": "Egyptian Colloquial Arabic Lexicon", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC99L22", + "License": "LDC User Agreement for Non-Members", + "Year": 2002, + "Language": "ar", + "Dialect": "ar-EG: (Arabic (Egypt))", + "Domain": "transcribed audio", + "Form": "text", + "Collection Style": "other", + "Description": "The lexicon contains 51,202 entries, drawn from 140 CALLHOME telephone conversations among native speakers of Colloquial Egyptian Arabic, collected and published by the LDC as follows: CALLHOME Egyptian Arabic Speech LDC97S45, CALLHOME Egyptian Arabic Transcripts LDC97T19, CALLHOME Egyptian Arabic Speech Supplement LDC2002S37 and CALLHOME Egyptian Arabic Transcripts Supplement LDC2002T38. The lexicon also contains entries derived manually from the Badawi & Hines dictionary of Colloquial Egyptian Arabic.", + "Volume": "51,202", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "2,000.00 $", + "Test Split": "No", + "Tasks": "speech recognition", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/elecmorocco2016.json b/datasets/elecmorocco2016.json new file mode 100644 index 0000000..b2489ca --- /dev/null +++ b/datasets/elecmorocco2016.json @@ -0,0 +1,36 @@ +{ + "Name": "ElecMorocco2016", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/ElecMorocco", + "Link": "https://github.com/sentiprojects/ElecMorocco2016", + "License": "unknown", + "Year": 2016, + "Language": "ar", + "Dialect": "ar-MA: (Arabic (Morocco))", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling", + "Description": "A sentiment analysis dataset containing 10,254 Arabic Facebook comments about the Moroccan elections of 2016.
The comments are written in Standard Arabic and Moroccan dialect.", + "Volume": "10,000", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "Collecting and Processing Arabic Facebook Comments for Sentiment Analysis", + "Paper Link": "https://link.springer.com/chapter/10.1007/978-3-319-66854-3_20", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "sentiment analysis", + "Venue Title": "MEDI", + "Citations": "5.0", + "Venue Type": "conference", + "Venue Name": "International Conference on Model and Data Engineering", + "Authors": "Abdeljalil Elouardighi, Mohcine Maghfour, Hafdalla Hammia", + "Affiliations": "nan", + "Abstract": "Social networks platforms such as Facebook are becoming one of the most powerful sources for information. The produced and shared data are important in volume, in velocity and in variety. Processing these data in the raw state to extract useful information can be a very difficult task and a big challenge. Furthermore, the Arabic language under its modern standard or dialectal shape is one of the languages producing an important quantity of data in social networks and the least analyzed. The characteristics and the specificity of the Arabic language present a big challenge for sentiment analysis, especially if this analysis is performed on Arabic Facebook comments. In this paper, we present a methodology that we have elaborated, for collecting and preprocessing Facebook comments written in Modern Standard Arabic (MSA) or in Moroccan Dialectal Arabic (MDA) for Sentiment Analysis (SA) using supervised classification methods. In this methodology, we have detailed the processing applied to the comments\u2019 text as well as various schemes of features\u2019 construction (words or groups of words) useful for supervised sentiments\u2019 classification. This methodology was tested on comments written in MSA or in MDA collected from Facebook for the sentiment analysis on a political phenomenon.
The experiments\u2019 results obtained are promising and this encourages us to continue working on this topic.", + "Added By": "Abderrahmane Issam" +} \ No newline at end of file diff --git a/datasets/emoji-sentiment-dataset.json b/datasets/emoji-sentiment-dataset.json new file mode 100644 index 0000000..3af7de0 --- /dev/null +++ b/datasets/emoji-sentiment-dataset.json @@ -0,0 +1,36 @@ +{ + "Name": "emoji-sentiment-dataset", + "Subsets": [], + "HF Link": "nan", + "Link": "https://github.com/snakers4/emoji-sentiment-dataset/tree/master#dataset", + "License": "custom", + "Year": 2019, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling", + "Description": "Following the success of DeepMoji and TorchMoji (1, 2), we would like to leverage Twitter as an open source of self-annotated data to create a balanced multi-language \"in-the-wild\" sentiment dataset to test the quality of various NLP models and/or word/sub-word tokenization techniques.", + "Volume": "287,578", + "Unit": "tokens", + "Ethical Risks": "Medium", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab-Latn", + "Tokenized": "Yes", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "sentiment analysis ", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Emad A Alghamdi" +} \ No newline at end of file diff --git a/datasets/emotional-tone.json b/datasets/emotional-tone.json new file mode 100644 index 0000000..9230901 --- /dev/null +++ b/datasets/emotional-tone.json @@ -0,0 +1,36 @@ +{ + "Name": "Emotional-Tone", + "Subsets": [], + "HF Link": "https://hf.co/datasets/emotone-ar-cicling2017/emotone_ar", + "Link": "https://github.com/AmrMehasseb/Emotional-Tone", + "License": "unknown", + "Year": 2017, + "Language": "ar", + "Dialect": "ar-EG: (Arabic (Egypt))", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "emotion detection dataset", + "Volume": "10,065", + "Unit": "sentences", + "Ethical Risks": "Medium", + "Provider": "Nile University", + "Derived From": "nan", + "Paper Title": "Emotional Tone Detection in Arabic Tweets", + "Paper Link": "https://www.researchgate.net/profile/Samhaa-El-Beltagy/publication/320271778_Emotional_Tone_Detection_in_Arabic_Tweets/links/59d9f0a5458515a5bc2b1d8a/Emotional-Tone-Detection-in-Arabic-Tweets.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "emotion detection", + "Venue Title": "CICLing", + "Citations": "10.0", + "Venue Type": "conference", + "Venue Name": "International Conference on Computational Linguistics and Intelligent Text Processing", + "Authors": "Amr Al-Khatib,S. El-Beltagy", + "Affiliations": ",", + "Abstract": "Emotion detection in Arabic text is an emerging research area, but the efforts in this new field have been hindered by the very limited availability of Arabic datasets annotated with emotions. In this paper, we review work that has been carried out in the area of emotion analysis in Arabic text. We then present an Arabic tweet dataset that we have built to serve this task. 
The efforts and methodologies followed to collect, clean, and annotate our dataset are described and preliminary experiments carried out on this dataset for emotion detection are presented. The results of these experiments are provided as a benchmark for future studies and comparisons with other emotion detection models. The best results over a set of eight emotions were obtained using a complement Naive Bayes algorithm with an overall accuracy of 68.12%.", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/english-arabic_treebank_v_1_0.json b/datasets/english-arabic_treebank_v_1_0.json new file mode 100644 index 0000000..9f34009 --- /dev/null +++ b/datasets/english-arabic_treebank_v_1_0.json @@ -0,0 +1,36 @@ +{ + "Name": "English-Arabic Treebank v 1.0", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2006T10", + "License": "LDC User Agreement for Non-Members", + "Year": 2006, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "other", + "Description": "The guidelines followed for both part-of-speech and treebank annotation are essentially Penn Treebank II style, with two notable differences:", + "Volume": "224", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "2,000.00 $", + "Test Split": "No", + "Tasks": "information retrieval,cross-lingual information retrieval,machine translation", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/everyayah.json b/datasets/everyayah.json new file mode 100644 index 0000000..9912f2b --- /dev/null +++ b/datasets/everyayah.json @@ -0,0 +1,36 @@ +{ + "Name": "EveryAyah", + "Subsets": [], + "HF Link": "https://hf.co/datasets/tarteel-ai/everyayah", + "Link": "https://hf.co/datasets/tarteel-ai/everyayah", + "License": "MIT License", + "Year": 2022, + "Language": "ar", + "Dialect": "ar-CLS: (Arabic (Classic))", + "Domain": "other", + "Form": "spoken", + "Collection Style": "manual curation", + "Description": "This dataset is a collection of Quranic verses and their transcriptions, with diacritization, by different reciters.", + "Volume": "127,400", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "Tarteel AI", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "HuggingFace", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "speech recognition", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/evetar.json b/datasets/evetar.json new file mode 100644 index 0000000..5b9bbe4 --- /dev/null +++ b/datasets/evetar.json @@ -0,0 +1,36 @@ +{ + "Name": "EveTAR", + "Subsets": [], + "HF Link": "nan", + "Link": "https://sites.google.com/view/bigir/datasets?authuser=0#h.p_dB9cxP-26Xnc", + "License": "unknown", + "Year": 2017, + "Language": "ar", + "Dialect": "mixed", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and 
annotation(other)", + "Description": "A crawl of 355M Arabic tweets covering 50 significant events", + "Volume": "3,550,000", + "Unit": "sentences", + "Ethical Risks": "Medium", + "Provider": "Qatar University", + "Derived From": "nan", + "Paper Title": "Building a large-scale multi-task test collection over Arabic tweets", + "Paper Link": "https://link.springer.com/article/10.1007/s10791-017-9325-7", + "Script": "Arab", + "Tokenized": "No", + "Host": "Dropbox", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "event detection, ad-hoc search, timeline generation, real-time summarization", + "Venue Title": "IRJ", + "Citations": "10.0", + "Venue Type": "journal", + "Venue Name": "Information Retrieval Journal", + "Authors": "Maram Hasanain,Reem Suwaileh,T. Elsayed,Mucahid Kutlu,H. Almerekhi", + "Affiliations": ",,,TOBB University of Economics and Technology,", + "Abstract": "This article introduces a new language-independent approach for creating a large-scale high-quality test collection of tweets that supports multiple information retrieval (IR) tasks without running a shared-task campaign. The adopted approach (demonstrated over Arabic tweets) designs the collection around significant (i.e., popular) events, which enables the development of topics that represent frequent information needs of Twitter users for which rich content exists. That inherently facilitates the support of multiple tasks that generally revolve around events, namely event detection, ad-hoc search, timeline generation, and real-time summarization. The key highlights of the approach include diversifying the judgment pool via interactive search and multiple manually-crafted queries per topic, collecting high-quality annotations via crowd-workers for relevancy and in-house annotators for novelty, filtering out low-agreement topics and inaccessible tweets, and providing multiple subsets of the collection for better availability. Applying our methodology on Arabic tweets resulted in EveTAR, the first freely-available tweet test collection for multiple IR tasks. EveTAR includes a crawl of 355M Arabic tweets and covers 50 significant events for which about 62K tweets were judged with substantial average inter-annotator agreement (Kappa value of 0.71). We demonstrate the usability of EveTAR by evaluating existing algorithms in the respective tasks.
Results indicate that the new collection can support reliable ranking of IR systems that is comparable to similar TREC collections, while providing strong baseline results for future studies over Arabic tweets.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/exams.json b/datasets/exams.json new file mode 100644 index 0000000..2eca84d --- /dev/null +++ b/datasets/exams.json @@ -0,0 +1,36 @@ +{ + "Name": "EXAMS", + "Subsets": [], + "HF Link": "https://hf.co/datasets/mhardalov/exams", + "Link": "https://github.com/mhardalov/exams-qa", + "License": "CC BY-SA 4.0", + "Year": 2020, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "more than 24,000 high-quality high school exam questions in 16 languages, covering 8 language families and 24 school subjects from Natural Sciences and Social Sciences, among others", + "Volume": "562", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "Multiple Institutions", + "Derived From": "nan", + "Paper Title": "EXAMS: A Multi-Subject High School Examinations Dataset for Cross-Lingual and Multilingual Question Answering", + "Paper Link": "https://arxiv.org/pdf/2011.03080.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "question answering", + "Venue Title": "EMNLP", + "Citations": "3.0", + "Venue Type": "conference", + "Venue Name": "Conference on Empirical Methods in Natural Language Processing", + "Authors": "Momchil Hardalov,Todor Mihaylov,Dimitrina Zlatkova,Yoan Dinkov,Ivan Koychev,Preslav Nakov", + "Affiliations": ",,,Sofia University,,", + "Abstract": "We propose EXAMS -- a new benchmark dataset for cross-lingual and multilingual question answering for high school examinations. We collected more than 24,000 high-quality high school exam questions in 16 languages, covering 8 language families and 24 school subjects from Natural Sciences and Social Sciences, among others. \n EXAMS offers a fine-grained evaluation framework across multiple languages and subjects, which allows precise analysis and comparison of various models. We perform various experiments with existing top-performing multilingual pre-trained models and we show that EXAMS offers multiple challenges that require multilingual knowledge and reasoning in multiple domains. We hope that EXAMS will enable researchers to explore challenging reasoning and knowledge transfer methods and pre-trained models for school question answering in various languages which was not possible before.
The data, code, pre-trained models, and evaluation are available at this https URL.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/fisher_levantine_arabic_conversational_telephone_speech,_transcripts.json b/datasets/fisher_levantine_arabic_conversational_telephone_speech,_transcripts.json new file mode 100644 index 0000000..a93a76a --- /dev/null +++ b/datasets/fisher_levantine_arabic_conversational_telephone_speech,_transcripts.json @@ -0,0 +1,36 @@ +{ + "Name": "Fisher Levantine Arabic Conversational Telephone Speech, Transcripts", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2007T04", + "License": "LDC User Agreement for Non-Members", + "Year": 2007, + "Language": "ar", + "Dialect": "ar-LEV: (Arabic (Levant))", + "Domain": "transcribed audio", + "Form": "text", + "Collection Style": "other", + "Description": "Fisher Levantine Arabic Conversational Telephone Speech, Transcripts contains transcripts for 279 telephone conversations. The majority of the speakers are from Jordan, Lebanon and Palestine.", + "Volume": "279", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "3,000.00 $", + "Test Split": "No", + "Tasks": "speech recognition", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/fisher_levantine_arabic_conversational_telephone_speech.json b/datasets/fisher_levantine_arabic_conversational_telephone_speech.json new file mode 100644 index 0000000..acb2afe --- /dev/null +++ b/datasets/fisher_levantine_arabic_conversational_telephone_speech.json @@ -0,0 +1,36 @@ +{ + "Name": "Fisher Levantine Arabic Conversational Telephone Speech", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2007S02", + "License": "LDC User Agreement for Non-Members", + "Year": 2006, + "Language": "ar", + "Dialect": "ar-LEV: (Arabic (Levant))", + "Domain": "transcribed audio", + "Form": "spoken", + "Collection Style": "other", + "Description": "Levantine Arabic QT Training Data Set 5, Speech was developed by the Linguistic Data Consortium (LDC) and contains 1,660 calls totalling approximately 250 hours of telephone conversation in Levantine Arabic.", + "Volume": "250", + "Unit": "hours", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "4,000.00 $", + "Test Split": "No", + "Tasks": "speech recognition", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/flodusta.json b/datasets/flodusta.json new file mode 100644 index 0000000..c9f8a6d --- /dev/null +++ b/datasets/flodusta.json @@ -0,0 +1,36 @@ +{ + "Name": "FloDusTA", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/FloDusTA_Dust_Storm", + "Link": "https://github.com/BatoolHamawi/FloDusTA", + "License": "CC BY 4.0", + "Year": 2020, + "Language": "ar", + "Dialect": "ar-SA: (Arabic (Saudi Arabia))", + "Domain": "social media", + "Form": 
"text", + "Collection Style": "crawling and annotation(other)", + "Description": "FloDusTA is a dataset of annotated tweets collected for the purpose of developing an event detection system. The dataset contains tweets written in both the MSA and Saudi dialect. Labels are: flood, dust storm, traffic accident, and non-event.", + "Volume": "8,998", + "Unit": "sentences", + "Ethical Risks": "Medium", + "Provider": "Umm Al-Qura University - Saudi Arabia", + "Derived From": "nan", + "Paper Title": "FloDusTA: Saudi Tweets Dataset for Flood, Dust Storm, and Traffic Accident Events", + "Paper Link": "https://aclanthology.org/2020.lrec-1.174.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "event detection", + "Venue Title": "LREC", + "Citations": "2.0", + "Venue Type": "conference", + "Venue Name": "Language Resources and Evaluation Conference", + "Authors": "Btool Hamoui, Mourad Mars, Khaled Almotairi", + "Affiliations": "Umm Al-Qura University - Saudi Arabia", + "Abstract": "The rise of social media platforms makes it a valuable information source of recent events and users\u2019 perspective towards them. Twitter has been one of the most important communication platforms in recent years. Event detection, one of the information extraction aspects, involves identifying specified types of events in the text. Detecting events from tweets can help to predict real-world events precisely. A serious challenge that faces Arabic event detection is the lack of Arabic datasets that can be exploited in detecting events. This paper will describe FloDusTA, which is a dataset of tweets that we have built for the purpose of developing an event detection system. The dataset contains tweets written in both Modern Standard Arabic and Saudi dialect. The process of building the dataset starting from tweets collection to annotation by human annotators will be present. The tweets are labeled with four labels: flood, dust storm, traffic accident, and non-event. 
The dataset was tested for classification and the result was strongly encouraging.", + "Added By": "Mourad Mars" +} \ No newline at end of file diff --git a/datasets/flores-101.json b/datasets/flores-101.json new file mode 100644 index 0000000..4c3582a --- /dev/null +++ b/datasets/flores-101.json @@ -0,0 +1,36 @@ +{ + "Name": "FLORES-101", + "Subsets": [], + "HF Link": "https://hf.co/datasets/gsarti/flores_101", + "Link": "https://github.com/facebookresearch/flores/tree/master/floresv1/data", + "License": "CC BY-SA 4.0", + "Year": 2021, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "Low Resource MT Benchmark", + "Volume": "3,100,000", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "Facebook", + "Derived From": "OPUS", + "Paper Title": "The FLORES-101 Evaluation Benchmark for Low-Resource and Multilingual Machine Translation", + "Paper Link": "https://arxiv.org/pdf/2106.03193.pdf", + "Script": "Arab", + "Tokenized": "Yes", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "machine translation", + "Venue Title": "ArXiv", + "Citations": "1.0", + "Venue Type": "preprint", + "Venue Name": "ArXiv", + "Authors": "Naman Goyal,Cynthia Gao,Vishrav Chaudhary,Guillaume Wenzek,Da Ju,Sanjan Krishnan,Marc'Aurelio Ranzato,Francisco Guzm\u00e1n,Angela Fan", + "Affiliations": ",,,,Facebook AI Research,,,,", + "Abstract": "One of the biggest challenges hindering progress in low-resource and multilingual machine translation is the lack of good evaluation benchmarks. Current evaluation benchmarks either lack good coverage of low-resource languages, consider only restricted domains, or are low quality because they are constructed using semi-automatic procedures. In this work, we introduce the FLORES-101 evaluation benchmark, consisting of 3001 sentences extracted from English Wikipedia and covering a variety of different topics and domains. These sentences have been translated in 101 languages by professional translators through a carefully controlled process. The resulting dataset enables better assessment of model quality on the long tail of low-resource languages, including the evaluation of many-to-many multilingual translation systems, as all translations are multilingually aligned.
By publicly releasing such a highquality and high-coverage dataset, we hope to foster progress in the machine translation community and beyond.", + "Added By": "Maraim Masoud" +} \ No newline at end of file diff --git a/datasets/gale_arabic-english_parallel_aligned_treebank_--_broadcast_news_part_1.json b/datasets/gale_arabic-english_parallel_aligned_treebank_--_broadcast_news_part_1.json new file mode 100644 index 0000000..e683476 --- /dev/null +++ b/datasets/gale_arabic-english_parallel_aligned_treebank_--_broadcast_news_part_1.json @@ -0,0 +1,36 @@ +{ + "Name": "GALE Arabic-English Parallel Aligned Treebank -- Broadcast News Part 1", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2013T14", + "License": "LDC User Agreement for Non-Members", + "Year": 2013, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "Please view the below samples.", + "Volume": "4,824", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "1,750.00 $", + "Test Split": "No", + "Tasks": "machine translation,information detection,cross-lingual information retrieval,information retrieval", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/gale_arabic-english_parallel_aligned_treebank_--_broadcast_news_part_2.json b/datasets/gale_arabic-english_parallel_aligned_treebank_--_broadcast_news_part_2.json new file mode 100644 index 0000000..1c6a772 --- /dev/null +++ b/datasets/gale_arabic-english_parallel_aligned_treebank_--_broadcast_news_part_2.json @@ -0,0 +1,36 @@ +{ + "Name": "GALE Arabic-English Parallel Aligned Treebank -- Broadcast News Part 2", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2014T03", + "License": "LDC User Agreement for Non-Members", + "Year": 2014, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "The source data consists of Arabic broadcast news programming collected by LDC in 2007 and 2008 from Al Arabiya, Abu Dhabi TV, Al Baghdadya TV, Al Fayha, Alhurra, Al Iraqiyah, Aljazeera, Al Ordiniyah, Al Sharqiya, Dubai TV, Oman TV, Radio Sawa and Saudi TV. All data is encoded as UTF-8. 
A count of files, words, tokens and segments is below.", + "Volume": "141,058", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "1,750.00 $", + "Test Split": "No", + "Tasks": "machine translation,information detection,cross-lingual information retrieval,information retrieval", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/gale_arabic-english_parallel_aligned_treebank_--_newswire.json b/datasets/gale_arabic-english_parallel_aligned_treebank_--_newswire.json new file mode 100644 index 0000000..c1a44c3 --- /dev/null +++ b/datasets/gale_arabic-english_parallel_aligned_treebank_--_newswire.json @@ -0,0 +1,36 @@ +{ + "Name": "GALE Arabic-English Parallel Aligned Treebank -- Newswire", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2013T10", + "License": "LDC User Agreement for Non-Members", + "Year": 2013, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "other", + "Description": "The source data consists of Arabic newswire from the Lebanese publication An Nahar collected by LDC in 2002. All data is encoded as UTF-8. A count of files, words, tokens and segments is below.", + "Volume": "7,711", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "1,750.00 $", + "Test Split": "No", + "Tasks": "machine translation,information detection,cross-lingual information retrieval,information retrieval", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/gale_arabic-english_parallel_aligned_treebank_--_web_training.json b/datasets/gale_arabic-english_parallel_aligned_treebank_--_web_training.json new file mode 100644 index 0000000..ac5baa7 --- /dev/null +++ b/datasets/gale_arabic-english_parallel_aligned_treebank_--_web_training.json @@ -0,0 +1,36 @@ +{ + "Name": "GALE Arabic-English Parallel Aligned Treebank -- Web Training", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2014T08", + "License": "LDC User Agreement for Non-Members", + "Year": 2014, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "web pages", + "Form": "text", + "Collection Style": "other", + "Description": "Please view the following samples:", + "Volume": "69,766", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "1,750.00 $", + "Test Split": "No", + "Tasks": "machine translation,information detection,cross-lingual information retrieval,information retrieval", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + 
"Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/gale_arabic-english_word_alignment_--_broadcast_training_part_1.json b/datasets/gale_arabic-english_word_alignment_--_broadcast_training_part_1.json new file mode 100644 index 0000000..b305877 --- /dev/null +++ b/datasets/gale_arabic-english_word_alignment_--_broadcast_training_part_1.json @@ -0,0 +1,36 @@ +{ + "Name": "GALE Arabic-English Word Alignment -- Broadcast Training Part 1", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2014T19", + "License": "LDC User Agreement for Non-Members", + "Year": 2014, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "This release consists of Arabic source broadcast news and broadcast conversation data collected by LDC from 2007-2009. The distribution by genre, words, tokens and segments appears below:", + "Volume": "11,341", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "1,750.00 $", + "Test Split": "No", + "Tasks": "information retrieval,information retrieval,machine translation", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/gale_arabic-english_word_alignment_--_broadcast_training_part_2.json b/datasets/gale_arabic-english_word_alignment_--_broadcast_training_part_2.json new file mode 100644 index 0000000..a20a0b9 --- /dev/null +++ b/datasets/gale_arabic-english_word_alignment_--_broadcast_training_part_2.json @@ -0,0 +1,36 @@ +{ + "Name": "GALE Arabic-English Word Alignment -- Broadcast Training Part 2", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2014T22", + "License": "LDC User Agreement for Non-Members", + "Year": 2014, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "This release consists of Arabic source broadcast news and broadcast conversation data collected by LDC from 2007-2009. 
", + "Volume": "11,693", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "1,750.00 $", + "Test Split": "No", + "Tasks": "information retrieval,machine translation", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/gale_arabic-english_word_alignment_training_part_1_--_newswire_and_web.json b/datasets/gale_arabic-english_word_alignment_training_part_1_--_newswire_and_web.json new file mode 100644 index 0000000..6e7fdbb --- /dev/null +++ b/datasets/gale_arabic-english_word_alignment_training_part_1_--_newswire_and_web.json @@ -0,0 +1,36 @@ +{ + "Name": "GALE Arabic-English Word Alignment Training Part 1 -- Newswire and Web", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2014T05", + "License": "LDC User Agreement for Non-Members", + "Year": 2014, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "web pages", + "Form": "text", + "Collection Style": "other", + "Description": "This release consists of Arabic source newswire and web data collected by LDC in 2006 - 2008.", + "Volume": "12,806", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "1,750.00 $", + "Test Split": "No", + "Tasks": "information retrieval,machine translation", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/gale_arabic-english_word_alignment_training_part_2_--_newswire.json b/datasets/gale_arabic-english_word_alignment_training_part_2_--_newswire.json new file mode 100644 index 0000000..f5c6ab7 --- /dev/null +++ b/datasets/gale_arabic-english_word_alignment_training_part_2_--_newswire.json @@ -0,0 +1,36 @@ +{ + "Name": "GALE Arabic-English Word Alignment Training Part 2 -- Newswire", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2014T10", + "License": "LDC User Agreement for Non-Members", + "Year": 2014, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "other", + "Description": "This release consists of Arabic source newswire collected by LDC in 2004 - 2006 and 2008.
", + "Volume": "5,349", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "1,750.00 $", + "Test Split": "No", + "Tasks": "machine translation,information retrieval", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/gale_arabic-english_word_alignment_training_part_3_--_web.json b/datasets/gale_arabic-english_word_alignment_training_part_3_--_web.json new file mode 100644 index 0000000..d7c2491 --- /dev/null +++ b/datasets/gale_arabic-english_word_alignment_training_part_3_--_web.json @@ -0,0 +1,36 @@ +{ + "Name": "GALE Arabic-English Word Alignment Training Part 3 -- Web", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2014T14", + "License": "LDC User Agreement for Non-Members", + "Year": 2014, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "web pages", + "Form": "text", + "Collection Style": "other", + "Description": "This release consists of Arabic source web data collected by LDC.", + "Volume": "7,332", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "1,750.00 $", + "Test Split": "No", + "Tasks": "information retrieval,machine translation", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/gale_phase_1_arabic_blog_parallel_text.json b/datasets/gale_phase_1_arabic_blog_parallel_text.json new file mode 100644 index 0000000..7337978 --- /dev/null +++ b/datasets/gale_phase_1_arabic_blog_parallel_text.json @@ -0,0 +1,36 @@ +{ + "Name": "GALE Phase 1 Arabic Blog Parallel Text", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2008T02", + "License": "LDC User Agreement for Non-Members", + "Year": 2008, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "The task of preparing this corpus involved four stages of work: data scouting, data harvesting, formatting, and data selection.", + "Volume": "222", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "1,500.00 $", + "Test Split": "No", + "Tasks": "machine translation,language modeling", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/gale_phase_1_arabic_broadcast_news_parallel_text_-_part_1.json
b/datasets/gale_phase_1_arabic_broadcast_news_parallel_text_-_part_1.json new file mode 100644 index 0000000..2935691 --- /dev/null +++ b/datasets/gale_phase_1_arabic_broadcast_news_parallel_text_-_part_1.json @@ -0,0 +1,36 @@ +{ + "Name": "GALE Phase 1 Arabic Broadcast News Parallel Text - Part 1", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2007T24", + "License": "LDC User Agreement for Non-Members", + "Year": 2007, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "A total of 17 hours of Arabic broadcast news recordings was selected from six sources and seven different programs.", + "Volume": "17", + "Unit": "hours", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "1,500.00 $", + "Test Split": "No", + "Tasks": "cross-lingual information retrieval,machine translation", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/gale_phase_1_arabic_broadcast_news_parallel_text_-_part_2.json b/datasets/gale_phase_1_arabic_broadcast_news_parallel_text_-_part_2.json new file mode 100644 index 0000000..ca6e656 --- /dev/null +++ b/datasets/gale_phase_1_arabic_broadcast_news_parallel_text_-_part_2.json @@ -0,0 +1,36 @@ +{ + "Name": "GALE Phase 1 Arabic Broadcast News Parallel Text - Part 2", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2008T09", + "License": "LDC User Agreement for Non-Members", + "Year": 2008, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "transcribed audio", + "Form": "text", + "Collection Style": "other", + "Description": "A total of 10.7 hours of Arabic broadcast news recordings were selected from four sources and four different programs.", + "Volume": "10.7", + "Unit": "hours", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "1,500.00 $", + "Test Split": "No", + "Tasks": "machine translation,cross-lingual information retrieval", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/gale_phase_1_arabic_newsgroup_parallel_text_-_part_1.json b/datasets/gale_phase_1_arabic_newsgroup_parallel_text_-_part_1.json new file mode 100644 index 0000000..5f14c1c --- /dev/null +++ b/datasets/gale_phase_1_arabic_newsgroup_parallel_text_-_part_1.json @@ -0,0 +1,36 @@ +{ + "Name": "GALE Phase 1 Arabic Newsgroup Parallel Text - Part 1", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2009T03", + "License": "LDC User Agreement for Non-Members", + "Year": 2009, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "Preparing the source data involved four stages of work: data scouting, data harvesting, formatting and 
data selection.", + "Volume": "264", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "1,500.00 $", + "Test Split": "No", + "Tasks": "machine translation", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/gale_phase_1_arabic_newsgroup_parallel_text_-_part_2.json b/datasets/gale_phase_1_arabic_newsgroup_parallel_text_-_part_2.json new file mode 100644 index 0000000..8603cce --- /dev/null +++ b/datasets/gale_phase_1_arabic_newsgroup_parallel_text_-_part_2.json @@ -0,0 +1,36 @@ +{ + "Name": "GALE Phase 1 Arabic Newsgroup Parallel Text - Part 2", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2009T09", + "License": "LDC User Agreement for Non-Members", + "Year": 2009, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "Preparing the source data involved four stages of work: data scouting, data harvesting, formatting and data selection.", + "Volume": "263", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "1,500.00 $", + "Test Split": "No", + "Tasks": "machine translation", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/gale_phase_1_distillation_training.json b/datasets/gale_phase_1_distillation_training.json new file mode 100644 index 0000000..27aa794 --- /dev/null +++ b/datasets/gale_phase_1_distillation_training.json @@ -0,0 +1,36 @@ +{ + "Name": "GALE Phase 1 Distillation Training", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2007T20", + "License": "LDC User Agreement for Non-Members", + "Year": 2007, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "The annotation task involves responding to a series of user queries. For each query, annotators first find relevant documents and identify snippets (strings of contiguous text that answer the query) in the Arabic, Chinese or English source document. Annotators then create a nugget for each fact expressed in the snippet. Semantically equivalent nuggets are grouped into cross-language, cross-document \"supernuggets\".
Judges at BAE Systems finally provide relevance weights for each supernugget.", + "Volume": "81", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "2,000.00 $", + "Test Split": "No", + "Tasks": "topic detection and tracking,metadata extraction,message understanding,information retrieval,information extraction,distillation", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/gale_phase_2_arabic_broadcast_conversation_parallel_text_part_1.json b/datasets/gale_phase_2_arabic_broadcast_conversation_parallel_text_part_1.json new file mode 100644 index 0000000..f73878f --- /dev/null +++ b/datasets/gale_phase_2_arabic_broadcast_conversation_parallel_text_part_1.json @@ -0,0 +1,36 @@ +{ + "Name": "GALE Phase 2 Arabic Broadcast Conversation Parallel Text Part 1", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2012T06", + "License": "LDC User Agreement for Non-Members", + "Year": 2012, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "GALE Phase 2 Arabic Broadcast Conversation Parallel Text Part 1 includes 36 source-translation document pairs, comprising 169,109 words of Arabic source text and its English translation. Data is drawn from thirteen distinct Arabic programs broadcast between 2004 and 2007 from the following sources: Al Alam News Channel, a broadcaster located in Iran; Aljazeera, a regional broadcast programmer based in Doha, Qatar; Dubai TV, located in Dubai, United Arab Emirates; Oman TV, a national broadcaster located in the Sultanate of Oman; and Radio Sawa, a U.S. government-funded regional broadcaster. Broadcast conversation programming is generally more interactive than traditional news broadcasts and includes talk shows, interviews, call-in programs and roundtable discussions.
The programs in this release focus on current events topics.", + "Volume": "169,109", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "1,750.00 $", + "Test Split": "No", + "Tasks": "machine translation", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/gale_phase_2_arabic_broadcast_conversation_parallel_text_part_2.json b/datasets/gale_phase_2_arabic_broadcast_conversation_parallel_text_part_2.json new file mode 100644 index 0000000..c44d100 --- /dev/null +++ b/datasets/gale_phase_2_arabic_broadcast_conversation_parallel_text_part_2.json @@ -0,0 +1,36 @@ +{ + "Name": "GALE Phase 2 Arabic Broadcast Conversation Parallel Text Part 2", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2012T14", + "License": "LDC User Agreement for Non-Members", + "Year": 2012, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "GALE Phase 2 Arabic Broadcast Conversation Parallel Text Part 2 includes 29 source-translation document pairs, comprising 169,488 words of Arabic source text and its English translation. Data is drawn from eight distinct Arabic programs broadcast between 2004 and 2007 from Aljazeera, a regional broadcast programmer based in Doha, Qatar and Nile TV, an Egyptian broadcaster. Broadcast conversation programming is generally more interactive than traditional news broadcasts and includes talk shows, interviews, call-in programs and roundtables. 
The programs in this release focus on current events topics.", + "Volume": "169,488", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "1,750.00 $", + "Test Split": "No", + "Tasks": "machine translation", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/gale_phase_2_arabic_broadcast_conversation_speech_part_1.json b/datasets/gale_phase_2_arabic_broadcast_conversation_speech_part_1.json new file mode 100644 index 0000000..248a9d6 --- /dev/null +++ b/datasets/gale_phase_2_arabic_broadcast_conversation_speech_part_1.json @@ -0,0 +1,36 @@ +{ + "Name": "GALE Phase 2 Arabic Broadcast Conversation Speech Part 1", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2013S02", + "License": "LDC User Agreement for Non-Members", + "Year": 2013, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "spoken", + "Collection Style": "other", + "Description": "The broadcast conversation recordings in this release feature interviews, call-in programs and round table discussions focusing principally on current events from the following sources: Al Alam News Channel, based in Iran; Al Arabiya, a news television station based in Dubai; Aljazeera, a regional broadcaster located in Doha, Qatar; Al Ordiniyah, a national broadcast station in Jordan; Lebanese Broadcasting Corporation, a Lebanese television station; Nile TV, a broadcast programmer based in Egypt; Oman TV, a national broadcaster located in the Sultanate of Oman; Saudi TV, a national television station based in Saudi Arabia; and Syria TV, the national television station in Syria.
A table showing the number of programs and hours recorded from each source is contained in the readme file.", + "Volume": "123", + "Unit": "hours", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "2,000.00 $", + "Test Split": "No", + "Tasks": "speech recognition", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/gale_phase_2_arabic_broadcast_conversation_speech_part_2.json b/datasets/gale_phase_2_arabic_broadcast_conversation_speech_part_2.json new file mode 100644 index 0000000..5fdf164 --- /dev/null +++ b/datasets/gale_phase_2_arabic_broadcast_conversation_speech_part_2.json @@ -0,0 +1,36 @@ +{ + "Name": "GALE Phase 2 Arabic Broadcast Conversation Speech Part 2", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2013S07", + "License": "LDC User Agreement for Non-Members", + "Year": 2013, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "spoken", + "Collection Style": "other", + "Description": "The broadcast conversation recordings in this release feature interviews, call-in programs and roundtable discussions focusing principally on current events from the following sources: Abu Dhabi TV, based in Abu Dhabi, United Arab Emirates; Al Alam News Channel, based in Iran; Al Arabiya, a news television station based in Dubai; Aljazeera, a regional broadcaster located in Doha, Qatar; Lebanese Broadcasting Corporation, a Lebanese television station; Oman TV, a national broadcaster located in the Sultanate of Oman; Saudi TV, a national television station based in Saudi Arabia; and Syria TV, the national television station in Syria. A table showing the number of programs and hours recorded from each source is contained in the readme file.", + "Volume": "128", + "Unit": "hours", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "2,000.00 $", + "Test Split": "No", + "Tasks": "speech recognition", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/gale_phase_2_arabic_broadcast_conversation_transcripts_part_1.json b/datasets/gale_phase_2_arabic_broadcast_conversation_transcripts_part_1.json new file mode 100644 index 0000000..200035d --- /dev/null +++ b/datasets/gale_phase_2_arabic_broadcast_conversation_transcripts_part_1.json @@ -0,0 +1,36 @@ +{ + "Name": "GALE Phase 2 Arabic Broadcast Conversation Transcripts Part 1", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2013T04", + "License": "LDC User Agreement for Non-Members", + "Year": 2013, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "The transcript files are in plain-text, tab-delimited format (TDF) with UTF-8 encoding, and the transcribed data totals 752,747 tokens. 
The transcripts were created with the LDC-developed transcription tool, XTrans, a multi-platform, multilingual, multi-channel transcription tool that supports manual transcription and annotation of audio recordings. XTrans is available from the following link, http://www.ldc.upenn.edu/tools/XTrans/downloads/. ", + "Volume": "752,747", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "1,500.00 $", + "Test Split": "No", + "Tasks": "speech recognition", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/gale_phase_2_arabic_broadcast_conversation_transcripts_part_2.json b/datasets/gale_phase_2_arabic_broadcast_conversation_transcripts_part_2.json new file mode 100644 index 0000000..67c8120 --- /dev/null +++ b/datasets/gale_phase_2_arabic_broadcast_conversation_transcripts_part_2.json @@ -0,0 +1,36 @@ +{ + "Name": "GALE Phase 2 Arabic Broadcast Conversation Transcripts Part 2", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2013T17", + "License": "LDC User Agreement for Non-Members", + "Year": 2013, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "The transcript files are in plain-text, tab-delimited format (TDF) with UTF-8 encoding, and the transcribed data totals 763,945 tokens. The transcripts were created with the LDC-developed transcription tool, XTrans, a multi-platform, multilingual, multi-channel transcription tool that supports manual transcription and annotation of audio recordings. XTrans is available from the following link, http://www.ldc.upenn.edu/tools/XTrans/downloads/. ", + "Volume": "763,945", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "1,500.00 $", + "Test Split": "No", + "Tasks": "speech recognition", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/gale_phase_2_arabic_broadcast_news_parallel_text.json b/datasets/gale_phase_2_arabic_broadcast_news_parallel_text.json new file mode 100644 index 0000000..46454a3 --- /dev/null +++ b/datasets/gale_phase_2_arabic_broadcast_news_parallel_text.json @@ -0,0 +1,36 @@ +{ + "Name": "GALE Phase 2 Arabic Broadcast News Parallel Text", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2012T18", + "License": "LDC User Agreement for Non-Members", + "Year": 2012, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "GALE Phase 2 Arabic Broadcast News Parallel Text includes seven source-translation pairs, comprising 29,210 words of Arabic source text and its English translation. 
Data is drawn from six distinct Arabic programs broadcast between 2005 and 2007 from Abu Dhabi TV, based in Abu Dhabi, United Arab Emirates; Al Alam News Channel, based in Iran; Aljazeera, a regional broadcast programmer based in Doha, Qatar; Dubai TV, based in Dubai, United Arab Emirates; and Kuwait TV, a national television station based in Kuwait. The BN programming in this release focuses on current events topics.", + "Volume": "29,210", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "1,750.00 $", + "Test Split": "No", + "Tasks": "machine translation", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/gale_phase_2_arabic_broadcast_news_speech_part_1.json b/datasets/gale_phase_2_arabic_broadcast_news_speech_part_1.json new file mode 100644 index 0000000..462b4c3 --- /dev/null +++ b/datasets/gale_phase_2_arabic_broadcast_news_speech_part_1.json @@ -0,0 +1,36 @@ +{ + "Name": "GALE Phase 2 Arabic Broadcast News Speech Part 1", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2014S07", + "License": "LDC User Agreement for Non-Members", + "Year": 2014, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "spoken", + "Collection Style": "other", + "Description": "The broadcast recordings in this release feature news programs focusing principally on current events from the following sources: Abu Dhabi TV, a television station based in Abu Dhabi, United Arab Emirates; Al Alam News Channel, based in Iran; Alhurra, a U.S.
government-funded regional broadcaster; Aljazeera, a regional broadcaster located in Doha, Qatar; Dubai TV, a broadcast station in the United Arab Emirates; Al Iraqiyah, an Iraqi television station; Kuwait TV, a national broadcast station in Kuwait; Lebanese Broadcasting Corporation, a Lebanese television station; Nile TV, a broadcast programmer based in Egypt; Saudi TV, a national television station based in Saudi Arabia; and Syria TV, the national television station in Syria.", + "Volume": "30,000", + "Unit": "hours", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "2,000.00 $", + "Test Split": "No", + "Tasks": "speech recognition", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/gale_phase_2_arabic_broadcast_news_speech_part_2.json b/datasets/gale_phase_2_arabic_broadcast_news_speech_part_2.json new file mode 100644 index 0000000..7748142 --- /dev/null +++ b/datasets/gale_phase_2_arabic_broadcast_news_speech_part_2.json @@ -0,0 +1,36 @@ +{ + "Name": "GALE Phase 2 Arabic Broadcast News Speech Part 2", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2015S01", + "License": "LDC User Agreement for Non-Members", + "Year": 2015, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "spoken", + "Collection Style": "other", + "Description": "The broadcast recordings in this release feature news programs focusing principally on current events from the following sources: Abu Dhabi TV, a television station based in Abu Dhabi, United Arab Emirates; Al Alam News Channel, based in Iran; Aljazeera, a regional broadcaster located in Doha, Qatar; Al Ordiniyah, a national broadcast station in Jordan; Dubai TV, based in Dubai, United Arab Emirates; Al Iraqiyah, a television network based in Iraq; Kuwait TV, a national television station based in Kuwait; Lebanese Broadcasting Corporation, a Lebanese television station; Nile TV, a broadcast programmer based in Egypt; Saudi TV, a national television station based in Saudi Arabia; and Syria TV, the national television station in Syria.", + "Volume": "170", + "Unit": "hours", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "2,000.00 $", + "Test Split": "No", + "Tasks": "speech recognition", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/gale_phase_2_arabic_broadcast_news_transcripts_part_1.json b/datasets/gale_phase_2_arabic_broadcast_news_transcripts_part_1.json new file mode 100644 index 0000000..74685ef --- /dev/null +++ b/datasets/gale_phase_2_arabic_broadcast_news_transcripts_part_1.json @@ -0,0 +1,36 @@ +{ + "Name": "GALE Phase 2 Arabic Broadcast News Transcripts Part 1", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2014T17", + "License": "LDC User Agreement for Non-Members", + "Year": 2014, + "Language": "ar", + "Dialect": "ar-MSA: 
(Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "The transcript files are in plain-text, tab-delimited format (TDF) with UTF-8 encoding, and the transcribed data totals 897,868 tokens. The transcripts were created with the LDC-developed transcription tool, XTrans, a multi-platform, multilingual, multi-channel transcription tool that supports manual transcription and annotation of audio recordings. XTrans is available from the following link, https://www.ldc.upenn.edu/language-resources/tools/xtrans.", + "Volume": "897,868", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "1,500.00 $", + "Test Split": "No", + "Tasks": "speech recognition", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/gale_phase_2_arabic_broadcast_news_transcripts_part_2.json b/datasets/gale_phase_2_arabic_broadcast_news_transcripts_part_2.json new file mode 100644 index 0000000..db0dc1a --- /dev/null +++ b/datasets/gale_phase_2_arabic_broadcast_news_transcripts_part_2.json @@ -0,0 +1,36 @@ +{ + "Name": "GALE Phase 2 Arabic Broadcast News Transcripts Part 2", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2015T01", + "License": "LDC User Agreement for Non-Members", + "Year": 2015, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "The transcript files are in plain-text, tab-delimited format (TDF) with UTF-8 encoding, and the transcribed data totals 920,730 tokens. The transcripts were created with the LDC-developed transcription tool, XTrans, a multi-platform, multilingual, multi-channel transcription tool that supports manual transcription and annotation of audio recordings. 
XTrans is available from the following link, https://www.ldc.upenn.edu/language-resources/tools/xtrans.", + "Volume": "920,730", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "1,500.00 $", + "Test Split": "No", + "Tasks": "speech recognition", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/gale_phase_2_arabic_newswire_parallel_text.json b/datasets/gale_phase_2_arabic_newswire_parallel_text.json new file mode 100644 index 0000000..a703f01 --- /dev/null +++ b/datasets/gale_phase_2_arabic_newswire_parallel_text.json @@ -0,0 +1,36 @@ +{ + "Name": "GALE Phase 2 Arabic Newswire Parallel Text", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2012T17", + "License": "LDC User Agreement for Non-Members", + "Year": 2012, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "other", + "Description": "GALE Phase 2 Arabic Newswire Parallel Text includes 400 source-translation pairs, comprising 181,704 tokens of Arabic source text and its English translation. Data is drawn from six distinct Arabic newswire sources: Al Ahram, Al Hayat, Al-Quds Al-Arabi, An Nahar, Asharq Al-Awsat and Assabah.", + "Volume": "181,704", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "1,750.00 $", + "Test Split": "No", + "Tasks": "machine translation", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/gale_phase_2_arabic_web_parallel_text.json b/datasets/gale_phase_2_arabic_web_parallel_text.json new file mode 100644 index 0000000..10b15da --- /dev/null +++ b/datasets/gale_phase_2_arabic_web_parallel_text.json @@ -0,0 +1,36 @@ +{ + "Name": "GALE Phase 2 Arabic Web Parallel Text", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2013T01", + "License": "LDC User Agreement for Non-Members", + "Year": 2013, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "GALE Phase 2 Arabic Web Parallel Text includes 60 source-translation document pairs, comprising 42,089 words of Arabic source text and its English translation.
Data was drawn from various Arabic weblog and newsgroup sources.", + "Volume": "42,089", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "1,750.00 $", + "Test Split": "No", + "Tasks": "machine translation", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/gale_phase_3_and_4_arabic_broadcast_conversation_parallel_text.json b/datasets/gale_phase_3_and_4_arabic_broadcast_conversation_parallel_text.json new file mode 100644 index 0000000..02564aa --- /dev/null +++ b/datasets/gale_phase_3_and_4_arabic_broadcast_conversation_parallel_text.json @@ -0,0 +1,36 @@ +{ + "Name": "GALE Phase 3 and 4 Arabic Broadcast Conversation Parallel Text", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2015T05", + "License": "LDC User Agreement for Non-Members", + "Year": 2015, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "GALE Phase 3 and 4 Arabic Broadcast Conversation Parallel Text includes 55 source-translation document pairs, comprising 280,535 words of Arabic source text and its English translation. Data is drawn from 22 distinct Arabic programs broadcast between 2006 and 2008 from Al Alam News Channel, based in Iran; Al Arabiya, a news television station based in Dubai; Al Baghdadya, an Iraqi broadcaster; Al Fayhaa, a television channel in Iraq; Al Hiwar TV, based in London, United Kingdom; Aljazeera, a regional broadcaster located in Doha, Qatar; Bahrain TV, based in the Kingdom of Bahrain; Nile TV, a broadcast programmer based in Egypt; Oman TV, a national broadcaster located in the Sultanate of Oman; Saudi TV, a national television station based in Saudi Arabia; and Syria TV, the national television station in Syria.
Broadcast conversation programming is generally more interactive than traditional news broadcasts and includes talk shows, interviews, call-in programs and roundtables.", + "Volume": "280,535", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "1,750.00 $", + "Test Split": "No", + "Tasks": "machine translation", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/gale_phase_3_and_4_arabic_broadcast_news_parallel_text.json b/datasets/gale_phase_3_and_4_arabic_broadcast_news_parallel_text.json new file mode 100644 index 0000000..8e87e59 --- /dev/null +++ b/datasets/gale_phase_3_and_4_arabic_broadcast_news_parallel_text.json @@ -0,0 +1,36 @@ +{ + "Name": "GALE Phase 3 and 4 Arabic Broadcast News Parallel Text", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2015T07", + "License": "LDC User Agreement for Non-Members", + "Year": 2015, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "GALE Phase 3 and 4 Arabic Broadcast News Parallel Text includes 86 source-translation document pairs, comprising 325,538 words of Arabic source text and its English translation. Data is drawn from 28 distinct Arabic programs broadcast between 2007 and 2008 from Abu Dhabi TV, a television station based in Abu Dhabi, United Arab Emirates; Al Alam News Channel, based in Iran; Al Arabiya, a news television station based in Dubai; Al Baghdadya, an Iraqi broadcaster; Alhurra, a U.S.-government funded regional broadcaster; Al Iraqiyah, an Iraqi television station; Aljazeera, a regional broadcaster located in Doha, Qatar; Al Ordiniyah, a national broadcast station in Jordan; Al Sharqiya, an Iraqi broadcast programmer; Dubai TV, a broadcast station in the United Arab Emirates; Kuwait TV, a national broadcast station based in Kuwait; Lebanese Broadcasting Corporation, a Lebanese television station; Oman TV, a national broadcaster located in the Sultanate of Oman; Radio Sawa, a U.S.-government funded regional broadcaster; Saudi TV, a national television station based in Saudi Arabia; and Syria TV, the national television station in Syria. 
Broadcast news programming consists of news programs focusing principally on current events.", + "Volume": "325,538", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "1,750.00 $", + "Test Split": "No", + "Tasks": "machine translation", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/gale_phase_3_and_4_arabic_newswire_parallel_text.json b/datasets/gale_phase_3_and_4_arabic_newswire_parallel_text.json new file mode 100644 index 0000000..a39ef37 --- /dev/null +++ b/datasets/gale_phase_3_and_4_arabic_newswire_parallel_text.json @@ -0,0 +1,36 @@ +{ + "Name": "GALE Phase 3 and 4 Arabic Newswire Parallel Text", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2015T19", + "License": "LDC User Agreement for Non-Members", + "Year": 2015, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "other", + "Description": "GALE Phase 3 and 4 Arabic Newswire Parallel Text includes 551 source-translation document pairs, comprising 156,775 tokens of Arabic source text and its English translation. Data is drawn from seven distinct Arabic newswire sources: Agence France Presse, Al Ahram, Al Hayat, Al-Quds Al-Arabi, An Nahar, Asharq Al-Awsat and Assabah.", + "Volume": "156,775", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "1,750.00 $", + "Test Split": "No", + "Tasks": "machine translation", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/gale_phase_3_and_4_arabic_web_parallel_text.json b/datasets/gale_phase_3_and_4_arabic_web_parallel_text.json new file mode 100644 index 0000000..11bde1e --- /dev/null +++ b/datasets/gale_phase_3_and_4_arabic_web_parallel_text.json @@ -0,0 +1,36 @@ +{ + "Name": "GALE Phase 3 and 4 Arabic Web Parallel Text", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2016T08", + "License": "LDC User Agreement for Non-Members", + "Year": 2016, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "GALE Phase 3 and 4 Arabic Web Parallel Text includes 124 source-translation document pairs, comprising 61,662 tokens of Arabic source text and its English translation. 
Data is drawn from various Arabic weblog and newsgroup sources.", + "Volume": "61,662", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "1,750.00 $", + "Test Split": "No", + "Tasks": "machine translation", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/gale_phase_3_arabic_broadcast_conversation_speech_part_1.json b/datasets/gale_phase_3_arabic_broadcast_conversation_speech_part_1.json new file mode 100644 index 0000000..9397860 --- /dev/null +++ b/datasets/gale_phase_3_arabic_broadcast_conversation_speech_part_1.json @@ -0,0 +1,36 @@ +{ + "Name": "GALE Phase 3 Arabic Broadcast Conversation Speech Part 1", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2015S11", + "License": "LDC User Agreement for Non-Members", + "Year": 2015, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "spoken", + "Collection Style": "other", + "Description": "The broadcast conversation recordings in this release feature interviews, call-in programs and roundtable discussions focusing principally on current events from the following sources: Abu Dhabi TV, a television station based in Abu Dhabi, United Arab Emirates; Al Alam News Channel, based in Iran; Al Arabiya, a news television station based in Dubai; Aljazeera, a regional broadcaster located in Doha, Qatar; Al Ordiniyah, a national broadcast station in Jordan; Dubai TV, a broadcast station in the United Arab Emirates; Lebanese Broadcasting Corporation, a Lebanese television station; Oman TV, a national broadcaster located in the Sultanate of Oman; Saudi TV, a national television station based in Saudi Arabia; and Syria TV, the national television station in Syria.", + "Volume": "123", + "Unit": "hours", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "2,000.00 $", + "Test Split": "No", + "Tasks": "speech recognition", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/gale_phase_3_arabic_broadcast_conversation_speech_part_2.json b/datasets/gale_phase_3_arabic_broadcast_conversation_speech_part_2.json new file mode 100644 index 0000000..da65329 --- /dev/null +++ b/datasets/gale_phase_3_arabic_broadcast_conversation_speech_part_2.json @@ -0,0 +1,36 @@ +{ + "Name": "GALE Phase 3 Arabic Broadcast Conversation Speech Part 2", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2016S01", + "License": "LDC User Agreement for Non-Members", + "Year": 2016, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "spoken", + "Collection Style": "other", + "Description": "The broadcast conversation recordings in this release feature interviews, call-in programs and roundtable discussions focusing principally on current events from the following sources: Abu Dhabi TV, a television 
station based in Abu Dhabi, United Arab Emirates; Al Alam News Channel, based in Iran; Al Arabiya, a news television station based in Dubai; Al Baghdadya, an Iraqi broadcast programmer based in Egypt; Al Fayha, an Iraqi television channel; Al Hiwar, a regional broadcast station based in the United Kingdom; Alhurra, a U.S. government-funded regional broadcaster; Aljazeera, a regional broadcaster located in Doha, Qatar; Al Ordiniyah, a national broadcast station in Jordan; Bahrain TV, a television station in the Kingdom of Bahrain; Dubai TV, a broadcast station in the United Arab Emirates; Kuwait TV, a national broadcast station in Kuwait; Oman TV, a national broadcaster located in the Sultanate of Oman; Qatar TV, a broadcast programmer in Qatar; Saudi TV, a national television station based in Saudi Arabia; Syria TV, the national television station in Syria; and Tunisian National TV, a national television station in Tunisia.", + "Volume": "129", + "Unit": "hours", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "2,000.00 $", + "Test Split": "No", + "Tasks": "speech recognition", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/gale_phase_3_arabic_broadcast_conversation_transcripts_part_1.json b/datasets/gale_phase_3_arabic_broadcast_conversation_transcripts_part_1.json new file mode 100644 index 0000000..2ee2a84 --- /dev/null +++ b/datasets/gale_phase_3_arabic_broadcast_conversation_transcripts_part_1.json @@ -0,0 +1,36 @@ +{ + "Name": "GALE Phase 3 Arabic Broadcast Conversation Transcripts Part 1", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2015T16", + "License": "LDC User Agreement for Non-Members", + "Year": 2015, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "The transcript files are in plain-text, tab-delimited format (TDF) with UTF-8 encoding, and the transcribed data totals 733,233 tokens. The transcripts were created with the LDC-developed transcription tool, XTrans, a multi-platform, multilingual, multi-channel transcription tool that supports manual transcription and annotation of audio recordings.
XTrans is available from the following link, https://www.ldc.upenn.edu/language-resources/tools/xtrans.", + "Volume": "733,233", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "1,500.00 $", + "Test Split": "No", + "Tasks": "speech recognition", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/gale_phase_3_arabic_broadcast_conversation_transcripts_part_2.json b/datasets/gale_phase_3_arabic_broadcast_conversation_transcripts_part_2.json new file mode 100644 index 0000000..8ef62c4 --- /dev/null +++ b/datasets/gale_phase_3_arabic_broadcast_conversation_transcripts_part_2.json @@ -0,0 +1,36 @@ +{ + "Name": "GALE Phase 3 Arabic Broadcast Conversation Transcripts Part 2", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2016T06", + "License": "LDC User Agreement for Non-Members", + "Year": 2016, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "The transcript files are in plain-text, tab-delimited format (TDF) with UTF-8 encoding, and the transcribed data totals 845,791 tokens. The transcripts were created with the LDC tool, XTrans, which supports manual transcription and annotation of audio recordings. XTrans is available from the following link, https://www.ldc.upenn.edu/language-resources/tools/xtrans.", + "Volume": "845,791", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "1,500.00 $", + "Test Split": "No", + "Tasks": "speech recognition", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/gale_phase_3_arabic_broadcast_news_speech_part_1.json b/datasets/gale_phase_3_arabic_broadcast_news_speech_part_1.json new file mode 100644 index 0000000..9d8c9c8 --- /dev/null +++ b/datasets/gale_phase_3_arabic_broadcast_news_speech_part_1.json @@ -0,0 +1,36 @@ +{ + "Name": "GALE Phase 3 Arabic Broadcast News Speech Part 1", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2016S07", + "License": "LDC User Agreement for Non-Members", + "Year": 2016, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "spoken", + "Collection Style": "other", + "Description": "The broadcast news recordings in this release feature news broadcasts focusing principally on current events from the following sources: Abu Dhabi TV, a television station based in Abu Dhabi; Al Alam News Channel, based in Iran; Al Arabiya, a news television station based in Dubai; Al Iraqiyah, an Iraqi television station; Aljazeera, a regional broadcaster located in Doha, Qatar; Al Ordiniyah, a national broadcast station in Jordan; Dubai TV, a broadcast station in the United Arab Emirates; Kuwait TV, a national broadcast station in Kuwait; Lebanese Broadcasting Corporation, a Lebanese
television station; Nile TV, a broadcast programmer based in Egypt; Saudi TV, a national television station based in Saudi Arabia; and Syria TV, the national television station in Syria.", + "Volume": "132", + "Unit": "hours", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "2,000.00 $", + "Test Split": "No", + "Tasks": "speech recognition", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/gale_phase_3_arabic_broadcast_news_speech_part_2.json b/datasets/gale_phase_3_arabic_broadcast_news_speech_part_2.json new file mode 100644 index 0000000..0d8f809 --- /dev/null +++ b/datasets/gale_phase_3_arabic_broadcast_news_speech_part_2.json @@ -0,0 +1,36 @@ +{ + "Name": "GALE Phase 3 Arabic Broadcast News Speech Part 2", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2017S02", + "License": "LDC User Agreement for Non-Members", + "Year": 2017, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "spoken", + "Collection Style": "other", + "Description": "The recordings in this release feature news broadcasts focusing principally on current events from the following sources: Abu Dhabi TV, United Arab Emirates; Al Alam News Channel, based in Iran; Al Arabiya, a news television station based in Dubai; Al Iraqiyah, an Iraqi television station; Aljazeera, a regional broadcaster located in Doha, Qatar; Al-Manar TV, a broadcast programmer located in Lebanon; Al Ordiniyah, a national broadcast station in Jordan; Al Sharqiya, an Iraqi television station; Dubai TV, a broadcast station in the United Arab Emirates; Kuwait TV, a national broadcast station in Kuwait; Nile TV, a broadcast programmer based in Egypt; Oman TV, a national broadcaster located in the Sultanate of Oman; Saudi TV, a national television station based in Saudi Arabia; and Syria TV, the national television station in Syria.", + "Volume": "128", + "Unit": "hours", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "2,000.00 $", + "Test Split": "No", + "Tasks": "speech recognition", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/gale_phase_3_arabic_broadcast_news_transcripts_part_1.json b/datasets/gale_phase_3_arabic_broadcast_news_transcripts_part_1.json new file mode 100644 index 0000000..6446c83 --- /dev/null +++ b/datasets/gale_phase_3_arabic_broadcast_news_transcripts_part_1.json @@ -0,0 +1,36 @@ +{ + "Name": "GALE Phase 3 Arabic Broadcast News Transcripts Part 1", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2016T17", + "License": "LDC User Agreement for Non-Members", + "Year": 2016, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "The transcript files are in plain-text, tab-delimited format (TDF) with UTF-8
encoding, and the transcribed data totals 741,689 tokens. The transcripts were created with the LDC tool, XTrans, which supports manual transcription and annotation of audio recordings. XTrans is available from the following link, https://www.ldc.upenn.edu/language-resources/tools/xtrans.", + "Volume": "741,689", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "1,500.00 $", + "Test Split": "No", + "Tasks": "speech recognition", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/gale_phase_3_arabic_broadcast_news_transcripts_part_2.json b/datasets/gale_phase_3_arabic_broadcast_news_transcripts_part_2.json new file mode 100644 index 0000000..38f28c7 --- /dev/null +++ b/datasets/gale_phase_3_arabic_broadcast_news_transcripts_part_2.json @@ -0,0 +1,36 @@ +{ + "Name": "GALE Phase 3 Arabic Broadcast News Transcripts Part 2", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2017T04", + "License": "LDC User Agreement for Non-Members", + "Year": 2017, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "The transcript files are in plain-text, tab-delimited format (TDF) with UTF-8 encoding, and the transcribed data totals 721,846 tokens. The transcripts were created with the LDC tool, XTrans, which supports manual transcription and annotation of audio recordings. XTrans is available from the following link, https://www.ldc.upenn.edu/language-resources/tools/xtrans.", + "Volume": "721,846", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "1,500.00 $", + "Test Split": "No", + "Tasks": "speech recognition", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/gale_phase_4_arabic_broadcast_conversation_parallel_sentences.json b/datasets/gale_phase_4_arabic_broadcast_conversation_parallel_sentences.json new file mode 100644 index 0000000..4014119 --- /dev/null +++ b/datasets/gale_phase_4_arabic_broadcast_conversation_parallel_sentences.json @@ -0,0 +1,36 @@ +{ + "Name": "GALE Phase 4 Arabic Broadcast Conversation Parallel Sentences", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2016T11", + "License": "LDC User Agreement for Non-Members", + "Year": 2016, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "GALE Phase 4 Arabic Broadcast Conversation Parallel Sentences includes 170 source-translation document pairs, comprising 44,064 words (Arabic source) of translated data. Data is drawn from 45 distinct Arabic broadcast conversation (BC) sources. 
BC programming is more interactive than traditional broadcast news sources and may include talk shows, interviews, call-in programs and roundtables.", + "Volume": "44,064", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "1,750.00 $", + "Test Split": "No", + "Tasks": "machine translation", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/gale_phase_4_arabic_broadcast_conversation_speech.json b/datasets/gale_phase_4_arabic_broadcast_conversation_speech.json new file mode 100644 index 0000000..a909f02 --- /dev/null +++ b/datasets/gale_phase_4_arabic_broadcast_conversation_speech.json @@ -0,0 +1,36 @@ +{ + "Name": "GALE Phase 4 Arabic Broadcast Conversation Speech", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2017S15", + "License": "LDC User Agreement for Non-Members", + "Year": 2017, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "spoken", + "Collection Style": "other", + "Description": "The broadcast conversation recordings in this release feature interviews, call-in programs and roundtable discussions focusing principally on current events from the following sources: Al Alam News Channel, based in Iran; Al Fayhaa, an Iraqi television channel; Al Hiwar, a regional broadcast station based in the United Kingdom; Alhurra, a U.S. government-funded regional broadcaster; Aljazeera, a regional broadcaster located in Doha, Qatar; Al Ordiniyah, a national broadcast station in Jordan; Dubai TV, a broadcast station in the United Arab Emirates; Lebanese Broadcasting Corporation, a Lebanese television station; Saudi TV, a national television station based in Saudi Arabia; Syria TV, the national television station in Syria; and Tunisian National TV, a national television station in Tunisia.", + "Volume": "30,000", + "Unit": "hours", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "1,500.00 $", + "Test Split": "No", + "Tasks": "speech recognition", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/gale_phase_4_arabic_broadcast_conversation_transcripts.json b/datasets/gale_phase_4_arabic_broadcast_conversation_transcripts.json new file mode 100644 index 0000000..ec5f331 --- /dev/null +++ b/datasets/gale_phase_4_arabic_broadcast_conversation_transcripts.json @@ -0,0 +1,36 @@ +{ + "Name": "GALE Phase 4 Arabic Broadcast Conversation Transcripts", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2017T12", + "License": "LDC User Agreement for Non-Members", + "Year": 2017, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "The transcript files are in plain-text, tab-delimited format (TDF) with UTF-8 encoding, and the transcribed data totals 475,211 tokens.
The transcripts were created with the LDC tool XTrans, which supports manual transcription and annotation of audio recordings. XTrans is available from the following link, https://www.ldc.upenn.edu/language-resources/tools/xtrans.", + "Volume": "475,211", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "750.00 $", + "Test Split": "No", + "Tasks": "speech recognition", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/gale_phase_4_arabic_broadcast_news_parallel_sentences.json b/datasets/gale_phase_4_arabic_broadcast_news_parallel_sentences.json new file mode 100644 index 0000000..63fb4b4 --- /dev/null +++ b/datasets/gale_phase_4_arabic_broadcast_news_parallel_sentences.json @@ -0,0 +1,36 @@ +{ + "Name": "GALE Phase 4 Arabic Broadcast News Parallel Sentences", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2016T20", + "License": "LDC User Agreement for Non-Members", + "Year": 2016, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "GALE Phase 4 Arabic Broadcast News Parallel Sentences includes 106 source-translation document pairs, comprising 114,251 words (Arabic source) of translated data. Data is drawn from 24 distinct Arabic programs featuring news broadcasts.", + "Volume": "114,251", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "1,750.00 $", + "Test Split": "No", + "Tasks": "machine translation", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/gale_phase_4_arabic_broadcast_news_speech.json b/datasets/gale_phase_4_arabic_broadcast_news_speech.json new file mode 100644 index 0000000..3096108 --- /dev/null +++ b/datasets/gale_phase_4_arabic_broadcast_news_speech.json @@ -0,0 +1,36 @@ +{ + "Name": "GALE Phase 4 Arabic Broadcast News Speech", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2018S05", + "License": "LDC User Agreement for Non-Members", + "Year": 2018, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "spoken", + "Collection Style": "other", + "Description": "The recordings in this release feature news broadcasts focusing principally on current events from the following sources: Abu Dhabi TV, a television station based in Abu Dhabi, United Arab Emirates; Al Arabiya, a news television station based in Dubai; Al Baghdadya, an Iraqi broadcast programmer; Alhurra, a U.S. government-funded regional broadcaster; Al Iraqiyah, an Iraqi television station; Aljazeera, a regional broadcaster located in Doha, Qatar; Al Ordiniyah, a national broadcast station in Jordan; Kuwait TV, a national broadcast station based in Kuwait; Radio Sawa, a U.S.
government-funded regional broadcaster; Saudi TV, a national television station based in Saudi Arabia; Syria TV, the national television station in Syria; and Yemen TV, a television station based in Yemen.", + "Volume": "37", + "Unit": "hours", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "1,500.00 $", + "Test Split": "No", + "Tasks": "speech recognition", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/gale_phase_4_arabic_broadcast_news_transcripts.json b/datasets/gale_phase_4_arabic_broadcast_news_transcripts.json new file mode 100644 index 0000000..59fe6d9 --- /dev/null +++ b/datasets/gale_phase_4_arabic_broadcast_news_transcripts.json @@ -0,0 +1,36 @@ +{ + "Name": "GALE Phase 4 Arabic Broadcast News Transcripts", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2018T14", + "License": "LDC User Agreement for Non-Members", + "Year": 2018, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "The transcript files are in plain-text, tab-delimited format (TDF) with UTF-8 encoding, and the transcribed data totals 204,735 tokens. The transcripts were created with the LDC tool XTrans, which supports manual transcription and annotation of audio recordings. XTrans is available from the following link, https://www.ldc.upenn.edu/language-resources/tools/xtrans/downloads.", + "Volume": "204,735", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "750.00 $", + "Test Split": "No", + "Tasks": "speech recognition", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/gale_phase_4_arabic_newswire_parallel_sentences.json b/datasets/gale_phase_4_arabic_newswire_parallel_sentences.json new file mode 100644 index 0000000..38d2c66 --- /dev/null +++ b/datasets/gale_phase_4_arabic_newswire_parallel_sentences.json @@ -0,0 +1,36 @@ +{ + "Name": "GALE Phase 4 Arabic Newswire Parallel Sentences", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2016T27", + "License": "LDC User Agreement for Non-Members", + "Year": 2016, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "other", + "Description": "GALE Phase 4 Arabic Newswire Parallel Sentences includes 393 source-translation document pairs, comprising 62,669 words (Arabic source) of translated data. 
Data is drawn from six distinct Arabic newswire sources.", + "Volume": "62,669", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "1,750.00 $", + "Test Split": "No", + "Tasks": "machine translation", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/gale_phase_4_arabic_weblog_parallel_sentences.json b/datasets/gale_phase_4_arabic_weblog_parallel_sentences.json new file mode 100644 index 0000000..36d047d --- /dev/null +++ b/datasets/gale_phase_4_arabic_weblog_parallel_sentences.json @@ -0,0 +1,36 @@ +{ + "Name": "GALE Phase 4 Arabic Weblog Parallel Sentences", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2016T14", + "License": "LDC User Agreement for Non-Members", + "Year": 2016, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "GALE Phase 4 Arabic Weblog Parallel Sentences includes 1,067 source-translation document pairs, comprising 68,346 words (Arabic source) of translated data. Data is drawn from various Arabic newsgroup and weblog sources.", + "Volume": "68,346", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "1,750.00 $", + "Test Split": "No", + "Tasks": "machine translation", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/gem.json b/datasets/gem.json new file mode 100644 index 0000000..6aa62b0 --- /dev/null +++ b/datasets/gem.json @@ -0,0 +1,36 @@ +{ + "Name": "GEM", + "Subsets": [], + "HF Link": "https://hf.co/datasets/GEM/xlsum", + "Link": "https://gem-benchmark.com/", + "License": "Apache-2.0", + "Year": 2021, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "wikipedia", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "benchmark environment for Natural Language Generation with a focus on its Evaluation, both through human annotations and automated Metrics", + "Volume": "29,229", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "Multiple Institutions", + "Derived From": "MLSUM, XSUM, WikiLingua, WebNLG, CommonGen, E2E, DART, Czech Restaurant, ToTTo, Wiki-Auto, TurkCorpus, ASSET, Schema-Guided Dialog", + "Paper Title": "The GEM Benchmark: Natural Language Generation, its Evaluation and Metrics", + "Paper Link": "https://aclanthology.org/2021.gem-1.10.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "summarization", + "Venue Title": "GEM", + "Citations": "27.0", + "Venue Type": "workshop", + "Venue Name": "Generation Evaluation and Metrics", + "Authors": "Sebastian Gehrmann,Tosin P.
Adewumi,Karmanya Aggarwal,Pawan Sasanka Ammanamanchi,Aremu Anuoluwapo,Antoine Bosselut,Khyathi Raghavi Chandu,Miruna Adriana Clinciu,Dipanjan Das,Kaustubh D. Dhole,Wanyu Du,Esin Durmus,Ondrej Dusek,Chris C. Emezue,Varun Gangal,Cristina Garbacea,Tatsunori B. Hashimoto,Yufang Hou,Yacine Jernite,Harsh Jhamtani,Yangfeng Ji,Shailza Jolly,Mihir Kale,Dhruv Kumar,Faisal Ladhak,Aman Madaan,Mounica Maddela,Khyati Mahajan,Saad Mahamood,Bodhisattwa Prasad Majumder,Pedro Henrique Martins,Angelina McMillan-Major,Simon Mille,Emiel van Miltenburg,Moin Nadeem,Shashi Narayan,Vitaly Nikolaev,Rubungo Andre Niyongabo,Salomey Osei,Ankur P. Parikh,Laura Perez-Beltrachini,Niranjan Rao,Vikas Raunak,Juan Diego Rodr\u00edguez,Sashank Santhanam,Jo\u00e3o Sedoc,Thibault Sellam,Samira Shaikh,Anastasia Shimorina,Marco Antonio Sobrevilla Cabezudo,Hendrik Strobelt,Nishant Subramani,W. Xu,Diyi Yang,Akhila Yerukola,Jiawei Zhou", + "Affiliations": ",,,,,EPFL,,Edinburgh Centre for Robotics,,,,Stanford University,Charles University,,Carnegie Mellon University,,,,FAIR,Carnegie Mellon University,University of Virginia,,,,,,,,,University of California San Diego,,,,Tilburg University,Massachusetts Institute of Technology;MIT,,,,,,,,,The University of Texas at Austin,,,,,,Institute of Mathematics and Computer Sciences;University of S\u00e3o Paulo;Pontifical Catholic University of Peru,,Allen Institute for AI;Masakhane,,,,", + "Abstract": "We introduce GEM, a living benchmark for natural language Generation (NLG), its Evaluation, and Metrics. Measuring progress in NLG relies on a constantly evolving ecosystem of automated metrics, datasets, and human evaluation standards. Due to this moving target, new models often still evaluate on divergent anglo-centric corpora with well-established, but flawed, metrics. This disconnect makes it challenging to identify the limitations of current models and opportunities for progress. Addressing this limitation, GEM provides an environment in which models can easily be applied to a wide set of tasks and in which evaluation strategies can be tested. Regular updates to the benchmark will help NLG research become more multilingual and evolve the challenge alongside models. 
This paper serves as the description of the data for the 2021 shared task at the associated GEM Workshop.", + "Added By": "Maraim Masoud" +} \ No newline at end of file diff --git a/datasets/gem_-_wikilingua.json b/datasets/gem_-_wikilingua.json new file mode 100644 index 0000000..9f0ecdc --- /dev/null +++ b/datasets/gem_-_wikilingua.json @@ -0,0 +1,36 @@ +{ + "Name": "GEM - WikiLingua", + "Subsets": [], + "HF Link": "https://hf.co/datasets/esdurmus/wiki_lingua", + "Link": "https://github.com/esdurmus/Wikilingua", + "License": "CC0", + "Year": 2020, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "wikipedia", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "New Benchmark Dataset for Multilingual Abstractive Summarization", + "Volume": "29,229", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "Multiple Institutions", + "Derived From": "nan", + "Paper Title": "WikiLingua: A New Benchmark Dataset for Cross-Lingual Abstractive Summarization", + "Paper Link": "https://aclanthology.org/2020.findings-emnlp.360.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "summarization", + "Venue Title": "EMNLP", + "Citations": "9.0", + "Venue Type": "conference", + "Venue Name": "Conference on Empirical Methods in Natural Language Processing", + "Authors": "Faisal Ladhak,Esin Durmus,Claire Cardie,K. McKeown", + "Affiliations": ",Stanford University,,", + "Abstract": "We introduce WikiLingua, a large-scale, multilingual dataset for the evaluation of cross-lingual abstractive summarization systems. We extract article and summary pairs in 18 languages from WikiHow, a high quality, collaborative resource of how-to guides on a diverse set of topics written by human authors. We create gold-standard article-summary alignments across languages by aligning the images that are used to describe each how-to step in an article. As a set of baselines for further studies, we evaluate the performance of existing cross-lingual abstractive summarization methods on our dataset. We further propose a method for direct cross-lingual summarization (i.e., without requiring translation at inference time) by leveraging synthetic data and Neural Machine Translation as a pre-training step.
Our method significantly outperforms the baseline approaches, while being more cost efficient during inference.", + "Added By": "Maraim Masoud" +} \ No newline at end of file diff --git a/datasets/gem_-_xlsum.json b/datasets/gem_-_xlsum.json new file mode 100644 index 0000000..7bb6b29 --- /dev/null +++ b/datasets/gem_-_xlsum.json @@ -0,0 +1,36 @@ +{ + "Name": "GEM - XLSum", + "Subsets": [], + "HF Link": "https://hf.co/datasets/GEM/xlsum", + "Link": "https://github.com/csebuetnlp/xl-sum", + "License": "CC BY-NC-SA 4.0", + "Year": 2021, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "Large-Scale Multilingual Abstractive Summarization for 44 Languages", + "Volume": "46,897", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "Multiple Institutions", + "Derived From": "nan", + "Paper Title": "XL-Sum: Large-Scale Multilingual Abstractive Summarization for 44 Languages", + "Paper Link": "https://aclanthology.org/2021.findings-acl.413.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "summarization", + "Venue Title": "FINDINGS", + "Citations": "0.0", + "Venue Type": "conference", + "Venue Name": "Findings of the Association for Computational Linguistics", + "Authors": "Tahmid Hasan,Abhik Bhattacharjee,Md. Saiful Islam,Kazi Samin,Yuan-Fang Li,Yong-Bin Kang,M. Rahman,Rifat Shahriyar", + "Affiliations": ",,,,,,,", + "Abstract": "Contemporary works on abstractive text summarization have focused primarily on high-resource languages like English, mostly due to the limited availability of datasets for low/mid-resource ones. In this work, we present XL-Sum, a comprehensive and diverse dataset comprising 1 million professionally annotated article-summary pairs from BBC, extracted using a set of carefully designed heuristics. The dataset covers 44 languages ranging from low to high-resource, for many of which no public dataset is currently available. XL-Sum is highly abstractive, concise, and of high quality, as indicated by human and intrinsic evaluation. We fine-tune mT5, a state-of-the-art pretrained multilingual model, with XL-Sum and experiment on multilingual and low-resource summarization tasks. XL-Sum induces competitive results compared to the ones obtained using similar monolingual datasets: we show higher than 11 ROUGE-2 scores on 10 languages we benchmark on, with some of them exceeding 15, as obtained by multilingual training. Additionally, training on low-resource languages individually also provides competitive performance. To the best of our knowledge, XL-Sum is the largest abstractive summarization dataset in terms of the number of samples collected from a single source and the number of languages covered. We are releasing our dataset and models to encourage future research on multilingual abstractive summarization. The resources can be found at https://github.com/csebuetnlp/xl-sum.", + "Added By": "Maraim Masoud" +} \ No newline at end of file diff --git a/datasets/geowac.json b/datasets/geowac.json new file mode 100644 index 0000000..dc23aa4 --- /dev/null +++ b/datasets/geowac.json @@ -0,0 +1,133 @@ +{ + "Name": "GeoWAC", + "Subsets": [ + { + "Name": "United_Arab_Emirates ", + "Dialect": "ar-AE: (Arabic (United Arab Emirates))", + "Volume": "102,370,313", + "Unit": "tokens" + }, + { + "Name": "Palestine ", + "Dialect": "ar-PS: (Arabic (Palestine))", + "Volume": "97,161,325", + "Unit": "tokens" + }, + { + "Name": "Morocco ", + "Dialect": "ar-MA: (Arabic (Morocco))", + "Volume": "78,749,159", + "Unit": "tokens" + }, + { + "Name": "Syria", + "Dialect": "ar-SY: (Arabic (Syria))", + "Volume": "24,308,507", + "Unit": "tokens" + }, + { + "Name": "Saudi_Arabia ", + "Dialect": "ar-SA: (Arabic (Saudi Arabia))", + "Volume": "14,651,603", + "Unit": "tokens" + }, + { + "Name": "Algeria", + "Dialect": "ar-DZ: (Arabic (Algeria))", + "Volume": "11,533,390", + "Unit": "tokens" + }, + { + "Name": "Qatar ", + "Dialect": "ar-QA: (Arabic (Qatar))", + "Volume": "10,783,360", + "Unit": "tokens" + }, + { + "Name": "Jordan", + "Dialect": "ar-JO: (Arabic (Jordan))", + "Volume": "8,638,264", + "Unit": "tokens" + }, + { + "Name": "Oman", + "Dialect": "ar-OM: (Arabic (Oman))", + "Volume": "5,508,084", + "Unit": "tokens" + }, + { + "Name": "Bahrain", + "Dialect": "ar-BH: (Arabic (Bahrain))", + "Volume": "4,524,743", + "Unit": "tokens" + }, + { + "Name": "Iraq ", + "Dialect": "ar-IQ: (Arabic (Iraq))", + "Volume": "4,497,374", + "Unit": "tokens" + }, + { + "Name": "Tunisia", + "Dialect": "ar-TN: (Arabic (Tunisia))", + "Volume": "3,581,885", + "Unit": "tokens" + }, + { + "Name": "Sudan", + "Dialect": "ar-SD: (Arabic (Sudan))", + "Volume": "2,522,356", + "Unit": "tokens" + }, + { + "Name": "Egypt", + "Dialect": "ar-EG: (Arabic (Egypt))", + "Volume": "2,465,722", + "Unit": "tokens" + }, + { + "Name": "Djibouti ", + "Dialect": "ar-DJ: (Arabic (Djibouti))", + "Volume": "1,468,411", + "Unit": "tokens" + }, + { + "Name": "Somalia", + "Dialect": "ar-SO: (Arabic (Somalia))", + "Volume": "255,511", + "Unit": "tokens" + } + ], + "HF Link": "nan", + "Link": "https://github.com/jonathandunn/earthLings", + "License": "GPL-2.0", + "Year": 2020, + "Language": "multilingual", + "Dialect": "mixed", + "Domain": "other", + "Form": "text", + "Collection Style": "crawling", + "Description": "match the ground-truth geographic distribution of each language", + "Volume": "373,020,007", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "University of Canterbury", + "Derived From": "nan", + "Paper Title": "Geographically-Balanced Gigaword Corpora for 50 Language Varieties", + "Paper Link": "http://www.lrec-conf.org/proceedings/lrec2020/pdf/2020.lrec-1.308.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "dialect identification", + "Venue Title": "LREC", + "Citations": "5.0", + "Venue Type": "conference", + "Venue Name": "International Conference on Language Resources and Evaluation", + "Authors": "Jonathan Dunn,B. Adams", + "Affiliations": "University of Canterbury,", + "Abstract": "While text corpora have been steadily increasing in overall size, even very large corpora are not designed to represent global population demographics. For example, recent work has shown that existing English gigaword corpora over-represent inner-circle varieties from the US and the UK.
To correct implicit geographic and demographic biases, this paper uses country-level population demographics to guide the construction of gigaword web corpora. The resulting corpora explicitly match the ground-truth geographic distribution of each language, thus equally representing language users from around the world. This is important because it ensures that speakers of under-resourced language varieties (i.e., Indian English or Algerian French) are represented, both in the corpora themselves but also in derivative resources like word embeddings.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/glare.json b/datasets/glare.json new file mode 100644 index 0000000..a04dadb --- /dev/null +++ b/datasets/glare.json @@ -0,0 +1,36 @@ +{ + "Name": "GLARE", + "Subsets": [], + "HF Link": "https://hf.co/datasets/Fatima-Gh/GLARE", + "Link": "https://zenodo.org/record/6457824", + "License": "CC BY 4.0", + "Year": 2022, + "Language": "ar", + "Dialect": "mixed", + "Domain": "reviews", + "Form": "text", + "Collection Style": "crawling", + "Description": "GLARE, an Arabic Apps Reviews dataset collected from the Saudi Google PlayStore. It consists of 76M reviews, 69M of which are Arabic reviews of 9,980 Android Applications. We present the data collection methodology, along with a detailed Exploratory Data Analysis (EDA) and Feature Engineering on the gathered reviews. We also highlight possible use cases and benefits of the dataset.", + "Volume": "76,000,000", + "Unit": "sentences", + "Ethical Risks": "Medium", + "Provider": "SDAIA", + "Derived From": "nan", + "Paper Title": "GLARE: Google Apps Arabic Reviews Dataset", + "Paper Link": "https://zenodo.org/record/6457824", + "Script": "Arab", + "Tokenized": "No", + "Host": "zenodo", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "review classification", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "preprint", + "Venue Name": "nan", + "Authors": "AlGhamdi, Fatima; Mohammed, Reem; Al-Khalifa, Hend; Alowisheq, Areeb", + "Affiliations": "nan", + "Abstract": "This paper introduces GLARE an Arabic Apps Reviews dataset collected from Saudi Google PlayStore. It consists of 76M reviews,\n69M of which are Arabic reviews of 9,980 Android Applications. We present the data collection methodology, along with a detailed\nExploratory Data Analysis (EDA) and Feature Engineering on the gathered reviews.
We also highlight possible use cases and benefits\nof the dataset.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/gnome.json b/datasets/gnome.json new file mode 100644 index 0000000..e3e3b00 --- /dev/null +++ b/datasets/gnome.json @@ -0,0 +1,36 @@ +{ + "Name": "GNOME", + "Subsets": [], + "HF Link": "https://hf.co/datasets/Helsinki-NLP/opus_gnome", + "Link": "https://opus.nlpl.eu/GNOME.php", + "License": "unknown", + "Year": 2012, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "crawling", + "Description": "A parallel corpus of GNOME localization files", + "Volume": "800,000", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "OPUS", + "Derived From": "nan", + "Paper Title": "Parallel Data, Tools and Interfaces in OPUS", + "Paper Link": "http://www.lrec-conf.org/proceedings/lrec2012/pdf/463_Paper.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "machine translation", + "Venue Title": "LREC", + "Citations": "nan", + "Venue Type": "conference", + "Venue Name": "Language Resources and Evaluation Conference", + "Authors": "Jorg Tiedemann", + "Affiliations": "Department of Linguistics and Philology Uppsala University", + "Abstract": "This paper presents the current status of OPUS, a growing language resource of parallel corpora and related tools. The focus in OPUS\nis to provide freely available data sets in various formats together with basic annotation to be useful for applications in computational\nlinguistics, translation studies and cross-linguistic corpus studies. In this paper, we report about new data sets and their features,\nadditional annotation tools and models provided from the website and essential interfaces and on-line services included in the project.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/goud-sum.json b/datasets/goud-sum.json new file mode 100644 index 0000000..88c69b4 --- /dev/null +++ b/datasets/goud-sum.json @@ -0,0 +1,36 @@ +{ + "Name": "Goud-sum", + "Subsets": [], + "HF Link": "https://hf.co/datasets/Goud/Goud-sum", + "Link": "https://github.com/issam9/goud-summarization-dataset", + "License": "unknown", + "Year": 2022, + "Language": "ar", + "Dialect": "ar-MA: (Arabic (Morocco))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "crawling", + "Description": "Goud-sum contains 158k articles and their headlines extracted from the Goud.ma news website. The articles are written in the Arabic script.
All headlines are in Moroccan Darija, while articles may be in Moroccan Darija, in Modern Standard Arabic, or a mix of both (code-switched Moroccan Darija).", + "Volume": "158,000", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "GOUD.MA: A NEWS ARTICLE DATASET FOR SUMMARIZATION IN MOROCCAN DARIJA", + "Paper Link": "https://openreview.net/pdf?id=BMVq5MELb9", + "Script": "Arab", + "Tokenized": "No", + "Host": "HuggingFace", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "summarization", + "Venue Title": "ICLR", + "Citations": "0.0", + "Venue Type": "workshop", + "Venue Name": "International Conference on Learning Representations", + "Authors": "Abderrahmane Issam, Khalil Mrini", + "Affiliations": "Archipel Cognitive, University of California San Diego", + "Abstract": "Moroccan Darija is a vernacular spoken by over 30 million people primarily in\nMorocco. Despite a high number of speakers, it remains a low-resource language.\nIn this paper, we introduce GOUD.MA: a dataset of over 158k news articles for automatic summarization in code-switched Moroccan Darija. We analyze the dataset\nand find that it requires a high level of abstractive reasoning. We fine-tune the\nArabic-language BERT (AraBERT), and the language models for the Moroccan\n(DarijaBERT), and Algerian (DziriBERT) national vernaculars for summarization\non GOUD.MA. The results show that GOUD.MA is a challenging summarization\nbenchmark dataset. We release our dataset publicly in an effort to encourage the\ndiversity of evaluation tasks to improve language modeling in Moroccan Darija.", + "Added By": "Abderrahmane Issam" +} \ No newline at end of file diff --git a/datasets/gulf_arabic_conversational_telephone_speech,_transcripts.json b/datasets/gulf_arabic_conversational_telephone_speech,_transcripts.json new file mode 100644 index 0000000..02590b7 --- /dev/null +++ b/datasets/gulf_arabic_conversational_telephone_speech,_transcripts.json @@ -0,0 +1,36 @@ +{ + "Name": "Gulf Arabic Conversational Telephone Speech, Transcripts", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2006T15", + "License": "LDC User Agreement for Non-Members", + "Year": 2006, + "Language": "ar", + "Dialect": "ar-GLF: (Arabic (Gulf))", + "Domain": "transcribed audio", + "Form": "text", + "Collection Style": "other", + "Description": "Each transcript file is a tab-delimited flat table, where each line contains information and text for a single contiguous utterance, presented via the following fields:", + "Volume": "976", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "400.00 $", + "Test Split": "No", + "Tasks": "speech recognition", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/gulf_arabic_conversational_telephone_speech.json b/datasets/gulf_arabic_conversational_telephone_speech.json new file mode 100644 index 0000000..5492413 --- /dev/null +++ b/datasets/gulf_arabic_conversational_telephone_speech.json @@ -0,0 +1,36 @@ +{ + "Name": "Gulf Arabic Conversational Telephone Speech", + "Subsets": [], + "HF Link": "nan", + "Link": 
"https://catalog.ldc.upenn.edu/LDC2006S43", + "License": "LDC User Agreement for Non-Members", + "Year": 2006, + "Language": "ar", + "Dialect": "ar-GLF: (Arabic (Gulf))", + "Domain": "transcribed audio", + "Form": "spoken", + "Collection Style": "other", + "Description": "A total of 976 conversation sides from 975 Gulf Arabic speakers are provided (one speaker appears on two distinct calls). Most of the calls contain both sides of a conversation (that is, 450 two-channel recordings plus 76 single-channel recordings). The average duration per side is about 5.7 minutes.", + "Volume": "46.66", + "Unit": "hours", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "800.00 $", + "Test Split": "No", + "Tasks": "speaker identification,speech recognition,spoken dialogue modeling,spoken dialogue systems", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/gumar.json b/datasets/gumar.json new file mode 100644 index 0000000..2a29327 --- /dev/null +++ b/datasets/gumar.json @@ -0,0 +1,85 @@ +{ + "Name": "Gumar", + "Subsets": [ + { + "Name": "SA", + "Dialect": "ar-SA: (Arabic (Saudi Arabia))", + "Volume": "748", + "Unit": "documents" + }, + { + "Name": "AE", + "Dialect": "ar-AE: (Arabic (United Arab Emirates))", + "Volume": "165", + "Unit": "documents" + }, + { + "Name": "KW", + "Dialect": "ar-KW: (Arabic (Kuwait))", + "Volume": "73", + "Unit": "documents" + }, + { + "Name": "OM", + "Dialect": "ar-OM: (Arabic (Oman))", + "Volume": "14", + "Unit": "documents" + }, + { + "Name": "QA", + "Dialect": "ar-QA: (Arabic (Qatar))", + "Volume": "8", + "Unit": "documents" + }, + { + "Name": "BH", + "Dialect": "ar-BH: (Arabic (Bahrain))", + "Volume": "6", + "Unit": "documents" + }, + { + "Name": "GA", + "Dialect": "ar-GLF: (Arabic (Gulf))", + "Volume": "123", + "Unit": "documents" + }, + { + "Name": "Arabic", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Volume": "98", + "Unit": "documents" + } + ], + "HF Link": "nan", + "Link": "https://camel.abudhabi.nyu.edu/gumar/?page=download&lang=en", + "License": "custom", + "Year": 2016, + "Language": "ar", + "Dialect": "mixed", + "Domain": "other", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "a large-scale corpus of Gulf Arabic consisting of 110 million words from 1,200 forum novels", + "Volume": "1,236", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "NYU Abu Dhabi", + "Derived From": "nan", + "Paper Title": "A Large Scale Corpus of Gulf Arabic\r", + "Paper Link": "https://aclanthology.org/L16-1679.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "CAMeL Resources", + "Access": "Upon-Request", + "Cost": "nan", + "Test Split": "No", + "Tasks": "morphological analysis", + "Venue Title": "LREC", + "Citations": "37.0", + "Venue Type": "conference", + "Venue Name": "International Conference on Language Resources and Evaluation", + "Authors": "Salam Khalifa,Nizar Habash,D. Abdulrahim,Sara Hassan", + "Affiliations": "New York University Abu Dhabi,,,", + "Abstract": "Most Arabic natural language processing tools and resources are developed to serve Modern Standard Arabic (MSA), which is the official written language in the Arab World. 
Some Dialectal Arabic varieties, notably Egyptian Arabic, have received some attention lately and have a growing collection of resources that include annotated corpora and morphological analyzers and taggers. Gulf Arabic, however, lags behind in that respect. In this paper, we present the Gumar Corpus, a large-scale corpus of Gulf Arabic consisting of 110 million words from 1,200 forum novels. We annotate the corpus for sub-dialect information at the document level. We also present results of a preliminary study in the morphological annotation of Gulf Arabic which includes developing guidelines for a conventional orthography. The text of the corpus is publicly browsable through a web interface we developed for it.", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/haad.json b/datasets/haad.json new file mode 100644 index 0000000..3a97d8f --- /dev/null +++ b/datasets/haad.json @@ -0,0 +1,36 @@ +{ + "Name": "HAAD", + "Subsets": [], + "HF Link": "nan", + "Link": "https://github.com/msmadi/HAAD", + "License": "GPL-2.0", + "Year": 2015, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "books", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "Human Annotated Arabic Dataset of Book Reviews for Aspect Based Sentiment Analysis", + "Volume": "2,389", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "JUST", + "Derived From": "nan", + "Paper Title": "Human Annotated Arabic Dataset of Book Reviews for Aspect Based Sentiment Analysis", + "Paper Link": "https://ieeexplore.ieee.org/document/7300895", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Upon-Request", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "sentiment analysis", + "Venue Title": "FiCloud ", + "Citations": "68.0", + "Venue Type": "conference", + "Venue Name": "Conference on Future Internet of Things and Cloud", + "Authors": "Mohammad Al-Smadi,Omar Qawasmeh,Bashar Talafha,Muhannad Quwaider", + "Affiliations": ",,,", + "Abstract": "With the prominent advances in Web interaction and the enormous growth in user-generated content, sentiment analysis has gained more interest in commercial and academic purposes. Recently, sentiment analysis of Arabic user-generated content is increasingly viewed as an important research field. However, the majority of available approaches target the overall polarity of the text. To the best of our knowledge, there is no available research on aspect-based sentiment analysis (ABSA) of Arabic text. This can be explained due to the lack of publically available datasets prepared for ABSA, and to the slow progress in sentiment analysis of Arabic text research in general. This paper fosters the domain of Arabic ABSA, and provides a benchmark human annotated Arabic dataset (HAAD). HAAD consists of books reviews in Arabic which have been annotated by humans with aspect terms and their polarities. 
Nevertheless, the paper reports baseline results and a common evaluation technique to facilitate future evaluation of research and methods.", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/habibi.json b/datasets/habibi.json new file mode 100644 index 0000000..1965f1f --- /dev/null +++ b/datasets/habibi.json @@ -0,0 +1,73 @@ +{ + "Name": "Habibi ", + "Subsets": [ + { + "Name": "Gulf", + "Dialect": "ar-GLF: (Arabic (Gulf))", + "Volume": "9,484", + "Unit": "documents" + }, + { + "Name": "Egyptian", + "Dialect": "ar-EG: (Arabic (Egypt))", + "Volume": "7,265", + "Unit": "documents" + }, + { + "Name": "Levantine", + "Dialect": "ar-LEV: (Arabic (Levant))", + "Volume": "6,016", + "Unit": "documents" + }, + { + "Name": "Iraqi", + "Dialect": "ar-IQ: (Arabic (Iraq))", + "Volume": "3,438", + "Unit": "documents" + }, + { + "Name": "Sudan", + "Dialect": "ar-SD: (Arabic (Sudan))", + "Volume": "2,662", + "Unit": "documents" + }, + { + "Name": "Maghrebi", + "Dialect": "ar-NOR: (Arabic (North Africa))", + "Volume": "1,207", + "Unit": "documents" + } + ], + "HF Link": "https://hf.co/datasets/arbml/Habibi", + "Link": "https://www.lancaster.ac.uk/staff/elhaj/corpora.html", + "License": "unknown", + "Year": 2020, + "Language": "ar", + "Dialect": "mixed", + "Domain": "other", + "Form": "text", + "Collection Style": "crawling", + "Description": "The corpus comprises more than 30,000 Arabic song lyrics in 6 Arabic dialects for singers from 18 different Arabic countries.", + "Volume": "30,072", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "Lancaster University", + "Derived From": "nan", + "Paper Title": "Habibi - a multi Dialect multi National Arabic Song Lyrics Corpus", + "Paper Link": "https://aclanthology.org/2020.lrec-1.165.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "text generation, language modeling", + "Venue Title": "LREC", + "Citations": "7.0", + "Venue Type": "conference", + "Venue Name": "International Conference on Language Resources and Evaluation", + "Authors": "Mahmoud El-Haj", + "Affiliations": "nan", + "Abstract": "This paper introduces Habibi the first Arabic Song Lyrics corpus. The corpus comprises more than 30,000 Arabic song lyrics in 6 Arabic dialects for singers from 18 different Arabic countries. The lyrics are segmented into more than 500,000 sentences (song verses) with more than 3.5 million words. I provide the corpus in both comma separated value (csv) and annotated plain text (txt) file formats. In addition, I converted the csv version into JavaScript Object Notation (json) and eXtensible Markup Language (xml) file formats. To experiment with the corpus I run extensive binary and multi-class experiments for dialect and country-of-origin identification. The identification tasks include the use of several classical machine learning and deep learning models utilising different word embeddings. For the binary dialect identification task the best performing classifier achieved a testing accuracy of 93%. This was achieved using a word-based Convolutional Neural Network (CNN) utilising a Continuous Bag of Words (CBOW) word embeddings model. The results overall show all classical and deep learning models to outperform our baseline, which demonstrates the suitability of the corpus for both dialect and country-of-origin identification tasks.
I am making the corpus and the trained CBOW word embeddings freely available for research purposes.", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/hard.json b/datasets/hard.json new file mode 100644 index 0000000..e4ba1b7 --- /dev/null +++ b/datasets/hard.json @@ -0,0 +1,36 @@ +{ + "Name": "HARD", + "Subsets": [], + "HF Link": "nan", + "Link": "https://github.com/elnagara/HARD-Arabic-Dataset", + "License": "unknown", + "Year": 2018, + "Language": "ar", + "Dialect": "mixed", + "Domain": "reviews", + "Form": "text", + "Collection Style": "crawling", + "Description": "490,587 hotel reviews collected from the Booking.com website.", + "Volume": "93,700", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "Sharjah University", + "Derived From": "nan", + "Paper Title": "Hotel Arabic-Reviews Dataset Construction for Sentiment Analysis Applications", + "Paper Link": "https://link.springer.com/chapter/10.1007/978-3-319-67056-0_3", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "sentiment analysis, review classification", + "Venue Title": "INLP", + "Citations": "49.0", + "Venue Type": "journal", + "Venue Name": "Intelligent Natural Language Processing: Trends and Applications", + "Authors": "Ashraf Elnagar,Yasmin Khalifa,Anas Einea", + "Affiliations": ",,", + "Abstract": "Arabic language suffers from the lack of available large datasets for machine learning and sentiment analysis applications. This work adds to the recently reported large dataset BRAD, which is the largest Book Reviews in Arabic Dataset. In this paper, we introduce HARD (Hotel Arabic-Reviews Dataset), the largest Book Reviews in Arabic Dataset for subjective sentiment analysis and machine language applications. HARD comprises of 490587 hotel reviews collected from the Booking.com website. Each record contains the review text in the Arabic language, the reviewer\u2019s rating on a scale of 1 to 10 stars, and other attributes about the hotel/reviewer. We make available the full unbalanced dataset as well as a balanced subset. To examine the datasets, we implement six popular classifiers using Modern Standard Arabic (MSA) as well as Dialectal Arabic (DA). We test the sentiment analyzers for polarity and rating classifications. Furthermore, we implement a polarity lexicon-based sentiment analyzer. The findings confirm the effectiveness of the classifiers and the datasets. Our core contribution is to make this benchmark-dataset available and accessible to the research community on Arabic language.", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/hc_corpora.json b/datasets/hc_corpora.json new file mode 100644 index 0000000..a380997 --- /dev/null +++ b/datasets/hc_corpora.json @@ -0,0 +1,36 @@ +{ + "Name": "HC Corpora", + "Subsets": [], + "HF Link": "nan", + "Link": "https://web.archive.org/web/20161021044006/http://corpora.heliohost.org/", + "License": "unknown", + "Year": 2016, + "Language": "multilingual", + "Dialect": "mixed", + "Domain": "web pages", + "Form": "text", + "Collection Style": "crawling", + "Description": "The corpora are collected from publicly available sources by a web crawler.
The crawler checks for language, so as to mainly get texts consisting of the desired language", + "Volume": "nan", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "text generation, language modeling", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/hijja.json b/datasets/hijja.json new file mode 100644 index 0000000..0a147ff --- /dev/null +++ b/datasets/hijja.json @@ -0,0 +1,36 @@ +{ + "Name": "Hijja", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/Hijja2", + "Link": "https://github.com/israksu/Hijja2", + "License": "unknown", + "Year": 2020, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "handwriting", + "Form": "images", + "Collection Style": "manual curation", + "Description": "A set of handwritten images for the different characters of Arabic.", + "Volume": "47,434", + "Unit": "images", + "Ethical Risks": "Low", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "Arabic handwriting recognition system using convolutional neural network", + "Paper Link": "https://d-nb.info/1216415676/34", + "Script": "Arab", + "Tokenized": "Yes", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "optical character recognition", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Amr Keleg" +} \ No newline at end of file diff --git a/datasets/hyter_networks_of_selected_openmt08_09_sentences.json b/datasets/hyter_networks_of_selected_openmt08_09_sentences.json new file mode 100644 index 0000000..6ea5bd8 --- /dev/null +++ b/datasets/hyter_networks_of_selected_openmt08_09_sentences.json @@ -0,0 +1,36 @@ +{ + "Name": "HyTER Networks of Selected OpenMT08/09 Sentences", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2014T09", + "License": "LDC User Agreement for Non-Members", + "Year": 2014, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "other", + "Description": "The source material is comprised of Arabic and Chinese newswire and web data collected by LDC in 2007. Annotators created meaning-equivalent annotations under three annotation protocols. In the first protocol, foreign language native speakers built English networks starting from foreign language sentences. In the second, English native speakers built English networks from the best translation of a foreign language sentence as identified by NIST (National Institute of Standards and Technology). In the third protocol, English native speakers built English networks starting from the best translation, but those annotators also had access to three additional, independently produced human translations. 
Networks created by different annotators for each sentence were combined and evaluated.", + "Volume": "102", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "150.00 $", + "Test Split": "No", + "Tasks": "machine translation", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/idat.json b/datasets/idat.json new file mode 100644 index 0000000..cb8ff47 --- /dev/null +++ b/datasets/idat.json @@ -0,0 +1,36 @@ +{ + "Name": "IDAT", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/multilingual_irony", + "Link": "https://github.com/bilalghanem/multilingual_irony", + "License": "GPL-3.0", + "Year": 2019, + "Language": "ar", + "Dialect": "mixed", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "written in Modern Standard Arabic but also in different Arabic language varieties including Egypt, Gulf, Levantine and Maghrebi dialects", + "Volume": "5,030", + "Unit": "sentences", + "Ethical Risks": "Medium", + "Provider": "Multiple Institutions", + "Derived From": "nan", + "Paper Title": "IDAT@FIRE2019: Overview of the Track on Irony Detection in Arabic Tweets", + "Paper Link": "http://ceur-ws.org/Vol-2517/T4-1.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "irony detection", + "Venue Title": "FIRE", + "Citations": "32.0", + "Venue Type": "conference", + "Venue Name": "Forum for Information Retrieval Evaluation", + "Authors": "Bilal Ghanem,Jihen Karoui,F. Benamara,V\u00e9ronique Moriceau,P. Rosso", + "Affiliations": ",,,,", + "Abstract": "This overview paper describes the first shared task on irony detection for the Arabic language. The task consists of a binary classification of tweets as ironic or not using a dataset composed of 5,030 Arabic tweets about different political issues and events related to the Middle East and the Maghreb. Tweets in our dataset are written in Modern Standard Arabic but also in different Arabic language varieties including Egypt, Gulf, Levantine and Maghrebi dialects. Eighteen teams registered to the task among which ten submitted their runs. The methods of participants ranged from feature-based to neural networks using either classical machine learning techniques or ensemble methods.
The best performing system achieved F-score value of 0.844, showing that classical feature-based models outperform the neural ones.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/idrisi-r.json b/datasets/idrisi-r.json new file mode 100644 index 0000000..8aaa152 --- /dev/null +++ b/datasets/idrisi-r.json @@ -0,0 +1,61 @@ +{ + "Name": "IDRISI-R", + "Subsets": [ + { + "Name": "Arabic gold", + "Dialect": "mixed", + "Volume": "4,593", + "Unit": "sentences" + }, + { + "Name": "Arabic silver", + "Dialect": "mixed", + "Volume": "1,187,123", + "Unit": "sentences" + }, + { + "Name": "English gold", + "Dialect": "mixed", + "Volume": "20,514", + "Unit": "sentences" + }, + { + "Name": "English silver", + "Dialect": "mixed", + "Volume": "56,682", + "Unit": "sentences" + } + ], + "HF Link": "nan", + "Link": "https://github.com/rsuwaileh/IDRISI", + "License": "custom", + "Year": 2022, + "Language": "multilingual", + "Dialect": "mixed", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "IDRISI-R is the largest-scale publicly-available Twitter Location Mention Recognition (LMR) dataset, in both English and Arabic languages. It contains 41 disaster events of different types such as floods, fires, etc. In addition to tagging LMs in text, the LMs are labeled for location types such as countries, cities, streets, POIs, etc.", + "Volume": "1,268,912", + "Unit": "sentences", + "Ethical Risks": "Medium", + "Provider": "Qatar University, QCRI, Hamad Bin Khalifa University", + "Derived From": "Kawarith and humAID datasets", + "Paper Title": "(Under review) IDRISI-R: Large-scale English and Arabic Location Mention Recognition Datasets for Disaster Response over Twitter", + "Paper Link": "nan", + "Script": "Arab-Latn", + "Tokenized": "Yes", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "location mention recognition", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "journal", + "Venue Name": "nan", + "Authors": "Reem Suwaileh, Tamer Elsayed, Muhammad Imran", + "Affiliations": "Computer Science and Engineering Department, College of Engineering, Qatar University, Doha, Qatar. Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU), Doha, Qatar", + "Abstract": "While utilizing Twitter data for crisis management, a critical challenge that hinders authorities' response is the scarcity of geotagged messages. Although studies show the presence of toponyms in tweets and their effectiveness as alternative information to geotagged messages, limited focus has been given to location mention recognition in tweets. In fact, the community lacks a standard dataset to thrive research towards building robust models and solutions. To bridge this gap, we present two human-labeled datasets for the location mention recognition task in text messages, particularly tweets. The human annotation task labels toponym spans and assigns a location type (e.g., country, state, city) to them. The datasets contain tweets from 41 large-scale disaster events (e.g., floods, earthquakes) covering a wide geographical area of English and Arabic-speaking countries. Moreover, we benchmark the datasets using standard and deep learning models and present rigorous quantitative and qualitative analysis to highlight their superiority over past efforts. 
Last but not least, the trained models are used to process raw data comprising millions of tweets and offered as a silver dataset.", + "Added By": "Reem Suwaileh" +} \ No newline at end of file diff --git a/datasets/inaracorpus.json b/datasets/inaracorpus.json new file mode 100644 index 0000000..dd73064 --- /dev/null +++ b/datasets/inaracorpus.json @@ -0,0 +1,36 @@ +{ + "Name": "inaracorpus", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/inaracorpus", + "Link": "https://sourceforge.net/projects/inaracorpus/", + "License": "unknown", + "Year": 2013, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "InAra corpus comprises 1024 documents; 80% of them contain passages borrowed from other documents to simulate plagiarism. Each suspicious document has an associated XML file that records the length and position of each plagiarized passage.", + "Volume": "1,024", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "sourceforge", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "Intrinsic plagiarism detection", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/infopankki_v1.json b/datasets/infopankki_v1.json new file mode 100644 index 0000000..bbac166 --- /dev/null +++ b/datasets/infopankki_v1.json @@ -0,0 +1,36 @@ +{ + "Name": "infopankki v1", + "Subsets": [], + "HF Link": "https://hf.co/datasets/Helsinki-NLP/opus_infopankki", + "Link": "https://opus.nlpl.eu/infopankki-v1.php", + "License": "unknown", + "Year": 2012, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "crawling", + "Description": "A parallel corpus of 12 languages, 66 bitexts.", + "Volume": "63,000", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "OPUS", + "Derived From": "nan", + "Paper Title": "Parallel Data, Tools and Interfaces in OPUS", + "Paper Link": "http://www.lrec-conf.org/proceedings/lrec2012/pdf/463_Paper.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "machine translation", + "Venue Title": "LREC", + "Citations": "nan", + "Venue Type": "conference", + "Venue Name": "Language Resources and Evaluation Conference", + "Authors": "Jorg Tiedemann", + "Affiliations": "Department of Linguistics and Philology Uppsala University", + "Abstract": "This paper presents the current status of OPUS, a growing language resource of parallel corpora and related tools. The focus in OPUS is to provide freely available data sets in various formats together with basic annotation to be useful for applications in computational linguistics, translation studies and cross-linguistic corpus studies.
In this paper, we report about new data sets and their features, additional annotation tools and models provided from the website and essential interfaces and on-line services included in the project.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/international_corpus_of_arabic.json b/datasets/international_corpus_of_arabic.json new file mode 100644 index 0000000..c2900cb --- /dev/null +++ b/datasets/international_corpus_of_arabic.json @@ -0,0 +1,36 @@ +{ + "Name": "International Corpus of Arabic", + "Subsets": [], + "HF Link": "nan", + "Link": "http://www.bibalex.org/ica/ar/", + "License": "unknown", + "Year": 2007, + "Language": "ar", + "Dialect": "mixed", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "There are two points of view about the need for corpora. The first says that no corpus, however large, can contain information about all areas of a language's lexicon and grammar. The second is that every corpus, however small, teaches facts that could not be discovered in any other way.", + "Volume": "nan", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "Alexandria University", + "Derived From": "nan", + "Paper Title": "Building an International Corpus of Arabic (ICA): Progress of Compilation Stage", + "Paper Link": "http://www.bibalex.org/isis/UploadedFiles/Publications/Building%20an%20Intl%20corpus%20of%20arabic.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Upon-Request", + "Cost": "nan", + "Test Split": "No", + "Tasks": "text generation, language modeling", + "Venue Title": "nan", + "Citations": "35.0", + "Venue Type": "conference", + "Venue Name": "7th international conference on language engineering, Cairo, Egypt", + "Authors": "Sameh Alansary, Magdy Nagi, Noha Adly", + "Affiliations": "nan", + "Abstract": "This paper focuses on three axes. The first axis gives a survey of the importance of corpora in language studies e.g. lexicography, grammar, semantics, Natural Language Processing and other areas. The second axis demonstrates how the Arabic language lacks textual resources, such as corpora and tools for corpus analysis, and the effect of this lack on the quality of Arabic language applications. There are rarely successful trials in compiling Arabic corpora; therefore, the third axis presents the technical design of the International Corpus of Arabic (ICA), a newly established representative corpus of Arabic that is intended to cover the Arabic language as being used all over the Arab world.
The corpus is planned to support various Arabic studies that depend on authentic data, in addition to building Arabic Natural Language Processing Applications.", + "Added By": "Abdelrahman Rezk" +} \ No newline at end of file diff --git a/datasets/iraqi_arabic_conversational_telephone_speech,_transcripts.json b/datasets/iraqi_arabic_conversational_telephone_speech,_transcripts.json new file mode 100644 index 0000000..5df6f2a --- /dev/null +++ b/datasets/iraqi_arabic_conversational_telephone_speech,_transcripts.json @@ -0,0 +1,36 @@ +{ + "Name": "Iraqi Arabic Conversational Telephone Speech, Transcripts", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2006T16", + "License": "LDC User Agreement for Non-Members", + "Year": 2006, + "Language": "ar", + "Dialect": "ar-IQ: (Arabic (Iraq))", + "Domain": "transcribed audio", + "Form": "text", + "Collection Style": "other", + "Description": "A total of 478 conversation sides from 474 unique speakers are provided, and most of these transcripts contain both sides of a conversation (202 transcripts with both sides and 74 with just one side). The average duration per call is about six minutes, so each call side contains about three minutes of speech, on average.", + "Volume": "50", + "Unit": "hours", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "200.00 $", + "Test Split": "No", + "Tasks": "speech recognition", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/iraqi_arabic_conversational_telephone_speech.json b/datasets/iraqi_arabic_conversational_telephone_speech.json new file mode 100644 index 0000000..bdc461e --- /dev/null +++ b/datasets/iraqi_arabic_conversational_telephone_speech.json @@ -0,0 +1,36 @@ +{ + "Name": "Iraqi Arabic Conversational Telephone Speech", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2006S45", + "License": "LDC User Agreement for Non-Members", + "Year": 2006, + "Language": "ar", + "Dialect": "ar-IQ: (Arabic (Iraq))", + "Domain": "transcribed audio", + "Form": "spoken", + "Collection Style": "other", + "Description": "Iraqi Arabic Conversational Telephone Speech was developed by Appen Pty Ltd, Sydney, Australia and contains roughly 3000 mins of speech from Iraqi Arabic speakers taking part in spontaneous telephone conversations in Colloquial Iraqi Arabic.", + "Volume": "50", + "Unit": "hours", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "LDC", + "Access": "Upon-Request", + "Cost": "nan", + "Test Split": "No", + "Tasks": "speech recognition", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/isarcasmeval__semeval-2022_task_6.json b/datasets/isarcasmeval__semeval-2022_task_6.json new file mode 100644 index 0000000..6ae35b8 --- /dev/null +++ b/datasets/isarcasmeval__semeval-2022_task_6.json @@ -0,0 +1,36 @@ +{ + "Name": "iSarcasmEval: SemEval-2022 Task 6", + "Subsets": [], + "HF Link":
"https://hf.co/datasets/arbml/iSarcasmEval_task_A", + "Link": "https://github.com/iabufarha/iSarcasmEval", + "License": "unknown", + "Year": 2020, + "Language": "multilingual", + "Dialect": "mixed", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "A Dataset of Intended Sarcasm", + "Volume": "4,447", + "Unit": "sentences", + "Ethical Risks": "High", + "Provider": "University of Edinburgh", + "Derived From": "nan", + "Paper Title": "iSarcasm: A Dataset of Intended Sarcasm", + "Paper Link": "https://arxiv.org/pdf/1911.03123.pdf", + "Script": "Arab-Latn", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "sarcasm detection", + "Venue Title": "SEMEVAL", + "Citations": "17.0", + "Venue Type": "workshop", + "Venue Name": "International Workshop on Semantic Evaluation", + "Authors": "Silviu Oprea, Walid Magdy", + "Affiliations": "University of Edinburgh", + "Abstract": "We consider the distinction between intended and perceived sarcasm in the context of textual sarcasm detection. The former occurs when an utterance is sarcastic from the perspective of its author, while the latter occurs when the utterance is interpreted as sarcastic by the audience. We show the limitations of previous labelling methods in capturing intended sarcasm and introduce the iSarcasm dataset of tweets labeled for sarcasm directly by their authors. Examining the state-of-the-art sarcasm detection models on our dataset showed low performance compared to previously studied datasets, which indicates that these datasets might be biased or obvious and sarcasm could be a phenomenon under-studied computationally thus far. By providing the iSarcasm dataset, we aim to encourage future NLP research to develop methods for detecting sarcasm in text as intended by the authors of the text, not as labeled under assumptions that we demonstrate to be sub-optimal.", + "Added By": "Abdelrahman Kaseb" +} \ No newline at end of file diff --git a/datasets/isi_arabic-english_automatically_extracted_parallel_text.json b/datasets/isi_arabic-english_automatically_extracted_parallel_text.json new file mode 100644 index 0000000..d0457f6 --- /dev/null +++ b/datasets/isi_arabic-english_automatically_extracted_parallel_text.json @@ -0,0 +1,36 @@ +{ + "Name": "ISI Arabic-English Automatically Extracted Parallel Text", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2007T08", + "License": "LDC User Agreement for Non-Members", + "Year": 2007, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "The corpus contains 1,124,609 sentence pairs; the word count on the English side is approximately 31M words. 
The sentences in the parallel corpus preserve the form and encoding of the texts in the original Gigaword corpora.", + "Volume": "1,124,609", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "4,000.00 $", + "Test Split": "No", + "Tasks": "machine translation", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/jana__a_human-human_dialogues_corpus_for_egyptian_dialect.json b/datasets/jana__a_human-human_dialogues_corpus_for_egyptian_dialect.json new file mode 100644 index 0000000..c37fe17 --- /dev/null +++ b/datasets/jana__a_human-human_dialogues_corpus_for_egyptian_dialect.json @@ -0,0 +1,36 @@ +{ + "Name": "JANA: A Human-Human Dialogues Corpus for Egyptian Dialect", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2016T24", + "License": "LDC User Agreement for Non-Members", + "Year": 2016, + "Language": "ar", + "Dialect": "ar-EG: (Arabic (Egypt))", + "Domain": "transcribed audio", + "Form": "text", + "Collection Style": "other", + "Description": "The transcribed dialogues consist of 52 telephone calls and 30 instant messaging conversations, amounting to approximately 20,311 words. The data contains roughly 3,001 conversation turns, with an average of 6.7 words per turn, and 4,725 utterances, with an average of 4.3 words per utterance. The data was transcribed using Transcriber.", + "Volume": "20,311", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "1,650.00 $", + "Test Split": "No", + "Tasks": "dialogue generation", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/journalists_questions.json b/datasets/journalists_questions.json new file mode 100644 index 0000000..31b7cae --- /dev/null +++ b/datasets/journalists_questions.json @@ -0,0 +1,36 @@ +{ + "Name": "journalists_questions", + "Subsets": [], + "HF Link": "https://hf.co/datasets/mkqa", + "Link": "http://qufaculty.qu.edu.qa/telsayed/datasets/", + "License": "unknown", + "Year": 2016, + "Language": "ar", + "Dialect": "mixed", + "Domain": "social media", + "Form": "text", + "Collection Style": "human translation", + "Description": "Crowdsourcing was used to collect binary annotations for 10K potential question tweets based on whether they truly contain questions or not", + "Volume": "10,000", + "Unit": "sentences", + "Ethical Risks": "Medium", + "Provider": "Qatar University", + "Derived From": "nan", + "Paper Title": "What Questions Do Journalists Ask on Twitter?", + "Paper Link": "https://www.semanticscholar.org/paper/What-Questions-Do-Journalists-Ask-on-Twitter-Hasanain-Bagdouri/d1b32df7e9f39e6fba912cc209054ae0256638eb", + "Script": "Arab", + "Tokenized": "No", + "Host": "Dropbox", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "question answering", + "Venue Title": "ICWSM", + "Citations": "4.0", + "Venue Type": "conference", + "Venue
Name": "International Conference on Web and Social Media", + "Authors": "Maram Hasanain,Mossaab Bagdouri,T. Elsayed,D. Oard", + "Affiliations": ",,,", + "Abstract": "Social media platforms are a major source of information for both the general public and for journalists. Journalists use Twitter and other social media services to gather story ideas, to find eyewitnesses, and for a wide range of other purposes. One way in which journalists use Twitter is to ask questions. This paper reports on an empirical investigation of questions asked by Arab journalists on Twitter. The analysis begins with the development of an ontology of question types, proceeds to human annotation of training and test data, and concludes by reporting the level of accuracy that can be achieved with automated classification techniques. The results show good classifier effectiveness for high prevalence question types, but that obtaining sufficient training data for lower prevalence question types can be challenging.", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/kacst.json b/datasets/kacst.json new file mode 100644 index 0000000..e996e7c --- /dev/null +++ b/datasets/kacst.json @@ -0,0 +1,36 @@ +{ + "Name": "KACST", + "Subsets": [], + "HF Link": "nan", + "Link": "http://www.kacstac.org.sa", + "License": "unknown", + "Year": 2015, + "Language": "ar", + "Dialect": "mixed", + "Domain": "other", + "Form": "text", + "Collection Style": "crawling", + "Description": "The KACST Arabic corpus comprises more than 700 million words from the pre-Islamic era to the present day (a period covering more than 1,500 years), collected from 10 diverse mediums.", + "Volume": "7,000,000", + "Unit": "tokens", + "Ethical Risks": "Medium", + "Provider": "KACST", + "Derived From": "http://shamela.ws/ http://saaid.net/ http://www.awu.sy/ https://uqu.edu.sa/page/ar/518 http://www.kfu.edu.sa/ar/departments/sjournal/Pages/Home.aspx http://www.boe.gov.sa/MainLaws.aspx?lang=en http://www.arablegalportal.org/ http://www.alwatan.com.sa http://rosa-magazine.com/ http://www.spa.gov.sa/", + "Paper Title": "A 700M+ Arabic corpus: KACST Arabic corpus design and construction", + "Paper Link": "https://link.springer.com/article/10.1007/s10579-014-9284-1", + "Script": "Arab", + "Tokenized": "Yes", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "natural language inference", + "Venue Title": "LREC", + "Citations": "20.0", + "Venue Type": "journal", + "Venue Name": "International Conference on Language Resources and Evaluation", + "Authors": "Al-Thubaity, A.O", + "Affiliations": "King Abdulaziz City for Science & Technology (KACST) university", + "Abstract": "Compared with English, Arabic is a poorly-resourced language within the field of corpus linguistics. A lack of sufficient data and research has negatively affected Arabic corpus-based researchers and natural language processing practitioners. Although a number of Arabic corpora have been developed in recent years, the overall situation has improved little. The aim of this paper is twofold. First, it reviews 14 Arabic corpora categorized by their designated purpose, target language, mode of text, size, text date, location, text type/medium, text domain, representativeness, and balance. The review also describes the availability of the reviewed corpora, the presence of tokenization, lemmatization and tagging, and whether there are any tools available to search and explore them. 
Second, it introduces the King Abdulaziz City for Science and Technology (KACST) Arabic corpus, which was designed and created to overcome the limitations of existing Arabic corpora. The KACST Arabic corpus is a large and diverse Arabic corpus with clearly defined design criteria. It is carefully sampled, and its contents are classified based on time, region, medium, domain, and topic, and it can be searched and explored using these classifications. The KACST Arabic corpus comprises more than 700 million words from the pre-Islamic era to the present day (a period covering more than 1,500 years), collected from 10 diverse mediums. Each text has been further classified more specifically into domains and topics. The KACST Arabic corpus is freely available to explore on the Internet (http://www.kacstac.org.sa) using a variety of tools.", + "Added By": "Jezia Zakraoui" +} \ No newline at end of file diff --git a/datasets/kalamdz.json b/datasets/kalamdz.json new file mode 100644 index 0000000..9c0042b --- /dev/null +++ b/datasets/kalamdz.json @@ -0,0 +1,36 @@ +{ + "Name": "KalamDZ", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/KalamDZ", + "Link": "https://github.com/LIM-MoDos/KalamDZ", + "License": "unknown", + "Year": 2017, + "Language": "ar", + "Dialect": "ar-DZ: (Arabic (Algeria))", + "Domain": "transcribed audio", + "Form": "spoken", + "Collection Style": "crawling and annotation(other)", + "Description": "8 major Algerian Arabic sub-dialects with 4881 speakers and more than 104.4 hours segmented in utterances of at least 6 s", + "Volume": "104", + "Unit": "hours", + "Ethical Risks": "Low", + "Provider": "Laboratoire d\u2019informatique et Math\u00e9matiques Universit\u00e9", + "Derived From": "nan", + "Paper Title": "Toward a Web-based Speech Corpus for Algerian Arabic Dialectal Varieties", + "Paper Link": "https://aclanthology.org/W17-1317.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "speech classification, dialect identification", + "Venue Title": "WANLP", + "Citations": "10.0", + "Venue Type": "workshop", + "Venue Name": "Arabic Natural Language Processing Workshop", + "Authors": "Soumia Bougrine,Aicha Chorana,Abdallah Lakhdari,H. Cherroun", + "Affiliations": ",,,", + "Abstract": "The success of machine learning for automatic speech processing has raised the need for large scale datasets. However, collecting such data is often a challenging task as it implies significant investment involving time and money cost. In this paper, we devise a recipe for building largescale Speech Corpora by harnessing Web resources namely YouTube, other Social Media, Online Radio and TV. We illustrate our methodology by building KALAM\u2019DZ, An Arabic Spoken corpus dedicated to Algerian dialectal varieties. The preliminary version of our dataset covers all major Algerian dialects. In addition, we make sure that this material takes into account numerous aspects that foster its richness. In fact, we have targeted various speech topics. Some automatic and manual annotations are provided. They gather useful information related to the speakers and sub-dialect information at the utterance level.
Our corpus encompasses the 8 major Algerian Arabic sub-dialects with 4881 speakers and more than 104.4 hours segmented in utterances of at least 6 s.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/kalimat.json b/datasets/kalimat.json new file mode 100644 index 0000000..8a0ee60 --- /dev/null +++ b/datasets/kalimat.json @@ -0,0 +1,36 @@ +{ + "Name": "KALIMAT", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/kalimat_articles", + "Link": "https://sourceforge.net/projects/kalimat/", + "License": "custom", + "Year": 2013, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "crawling", + "Description": "20,291 Arabic articles collected from the Omani newspaper Alwatan", + "Volume": "20,291", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "Multiple Institutions", + "Derived From": "nan", + "Paper Title": "KALIMAT a Multipurpose Arabic Corpus", + "Paper Link": "https://eprints.lancs.ac.uk/id/eprint/71282/1/KALIMAT_ELHAJ_KOULALI.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "sourceforge", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "topic classification,summarization,named entity recognition,part of speech tagging,morphological analysis", + "Venue Title": "other", + "Citations": "30.0", + "Venue Type": "preprint", + "Venue Name": "nan", + "Authors": "Mahmoud El-Haj,R. Koulali", + "Affiliations": "Lancaster University,", + "Abstract": "Resources, such as corpora, are important for researchers working on Arabic Natural Language Processing (NLP) (Al-Sulaiti et al. 2006). For this reason we came up with the idea of generating an Arabic multipurpose corpus, which we call KALIMAT (Arabic transliteration of \u201cWORDS\u201d). The automatically created corpus could benefit researchers working on different Arabic NLP areas. In our work on Arabic we developed, enhanced and tested many Arabic NLP tools. We tuned these tools to provide high quality results. The tools include auto-summarisers, Part of Speech Tagger, Morphological Analyser and Named Entity Recognition (NER). We ran these tools using the same document collection.
We provide the output corpus freely for researchers to evaluate their work and to run experiments for different Arabic NLP purposes using one corpus.", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/kawarith.json b/datasets/kawarith.json new file mode 100644 index 0000000..0d22493 --- /dev/null +++ b/datasets/kawarith.json @@ -0,0 +1,36 @@ +{ + "Name": "Kawarith", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/kawarith_Kuwait_floods_18", + "Link": "https://github.com/alaa-a-a/kawarith", + "License": "CC BY-NC 4.0", + "Year": 2021, + "Language": "ar", + "Dialect": "mixed", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "a multi-dialect Arabic Twitter corpus for crisis events, comprising more than a million Arabic tweets collected during 22 crises that occurred between 2018 and 2020 and involved several types of hazard", + "Volume": "12,446", + "Unit": "sentences", + "Ethical Risks": "Medium", + "Provider": "University of Birmingham", + "Derived From": "nan", + "Paper Title": "Kawarith: an Arabic Twitter Corpus for Crisis Events", + "Paper Link": "https://aclanthology.org/2021.wanlp-1.5.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "crisis detection", + "Venue Title": "WANLP", + "Citations": "2.0", + "Venue Type": "workshop", + "Venue Name": "Arabic Natural Language Processing Workshop", + "Authors": "Alaa Alharbi,Mark G. Lee", + "Affiliations": "Taibah University;University of Birmingham,", + "Abstract": "Social media (SM) platforms such as Twitter provide large quantities of real-time data that can be leveraged during mass emergencies. Developing tools to support crisis-affected communities requires available datasets, which often do not exist for low resource languages. This paper introduces Kawarith a multi-dialect Arabic Twitter corpus for crisis events, comprising more than a million Arabic tweets collected during 22 crises that occurred between 2018 and 2020 and involved several types of hazard. Exploration of this content revealed the most discussed topics and information types, and the paper presents a labelled dataset from seven emergency events that serves as a gold standard for several tasks in crisis informatics research. Using annotated data from the same event, a BERT model is fine-tuned to classify tweets into different categories in the multi-label setting. Results show that BERT-based models yield good performance on this task even with small amounts of task-specific training data.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/kde4.json b/datasets/kde4.json new file mode 100644 index 0000000..957ac67 --- /dev/null +++ b/datasets/kde4.json @@ -0,0 +1,36 @@ +{ + "Name": "KDE4", + "Subsets": [], + "HF Link": "https://hf.co/datasets/Helsinki-NLP/kde4", + "Link": "https://opus.nlpl.eu/KDE4.php", + "License": "custom", + "Year": 2012, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "A parallel corpus of KDE4 localization files (v.2).
92 languages, 4,099 bitexts", + "Volume": "700,000", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "OPUS", + "Derived From": "nan", + "Paper Title": "Parallel Data, Tools and Interfaces in OPUS", + "Paper Link": "http://www.lrec-conf.org/proceedings/lrec2012/pdf/463_Paper.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "machine translation", + "Venue Title": "LREC", + "Citations": "nan", + "Venue Type": "conference", + "Venue Name": "Language Resources and Evaluation Conference", + "Authors": "Jorg Tiedemann", + "Affiliations": "Department of Linguistics and Philology Uppsala University, Uppsala/Sweden", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/khaleej-2004.json b/datasets/khaleej-2004.json new file mode 100644 index 0000000..a68bb78 --- /dev/null +++ b/datasets/khaleej-2004.json @@ -0,0 +1,36 @@ +{ + "Name": "Khaleej-2004", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/khaleej_2004", + "Link": "https://sourceforge.net/projects/arabiccorpus/files/", + "License": "unknown", + "Year": 2004, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "crawling", + "Description": "Extracted from the daily Arabic newspaper Akhbar al Khaleej, it includes 5120 news articles corresponding to 2,855,069 words covering four topics: sport, local news, international news and economy", + "Volume": "5,690", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "INRIA", + "Derived From": "nan", + "Paper Title": "Comparison of Topic Identification methods for Arabic Language", + "Paper Link": "https://hal.inria.fr/inria-00000448/document", + "Script": "Arab", + "Tokenized": "No", + "Host": "sourceforge", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "topic classification", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "preprint", + "Venue Name": "nan", + "Authors": "M. Abbas and K. Smaili", + "Affiliations": "INRIA-LORIA", + "Abstract": "In this paper we present two well-known methods for topic identification. The first one is a TFIDF classifier approach, and the second one is a machine learning based approach which is called Support Vector Machines (SVM). In our knowledge, we do not know several works on Arabic topic identification. So that we decide to investigate in this article. The corpus we used is extracted from the daily Arabic newspaper Akhbar Al Khaleej, it includes 5120 news articles corresponding to 2,855,069 words covering four topics: sport, local news, international news and economy.
According to our experiments, the results are encouraging both for SVM and TFIDF classifier, however we have noticed the superiority of the SVM classifier and its high capability to distinguish topics.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/khalidalt_tydiqa-goldp.json b/datasets/khalidalt_tydiqa-goldp.json new file mode 100644 index 0000000..08b3dcc --- /dev/null +++ b/datasets/khalidalt_tydiqa-goldp.json @@ -0,0 +1,36 @@ +{ + "Name": "khalidalt/tydiqa-goldp", + "Subsets": [], + "HF Link": "https://hf.co/datasets/khalidalt/tydiqa-goldp", + "Link": "https://hf.co/datasets/khalidalt/tydiqa-goldp", + "License": "unknown", + "Year": 2020, + "Language": "multilingual", + "Dialect": "ar-CLS: (Arabic (Classic))", + "Domain": "other", + "Form": "text", + "Collection Style": "crawling", + "Description": "TyDi QA is a question answering dataset covering 11 typologically diverse languages with 204K question-answer pairs.", + "Volume": "204,000", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "Google", + "Derived From": "nan", + "Paper Title": "TyDi QA: A Benchmark for Information-Seeking Question Answering in Typologically Diverse Languages", + "Paper Link": "https://aclanthology.org/2020.tacl-1.30.pdf", + "Script": "Arab-Latn", + "Tokenized": "No", + "Host": "HuggingFace", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "question answering", + "Venue Title": "TACL", + "Citations": "178.0", + "Venue Type": "journal", + "Venue Name": "Transactions of the Association for Computational Linguistics", + "Authors": "Jonathan H. Clark, Eunsol Choi, Michael Collins, Dan Garrette, Tom Kwiatkowski, Vitaly Nikolaev, Jennimaria Palomaki", + "Affiliations": "Google", + "Abstract": "Confidently making progress on multilingual modeling requires challenging, trustworthy evaluations. We present TyDi QA\u2014a question answering dataset covering 11 typologically diverse languages with 204K question-answer pairs. The languages of TyDi QA are diverse with regard to their typology\u2014the set of linguistic features each language expresses\u2014such that we expect models performing well on this set to generalize across a large number of the world\u2019s languages. We present a quantitative analysis of the data quality and example-level qualitative linguistic analyses of observed language phenomena that would not be found in English-only corpora. To provide a realistic information-seeking task and avoid priming effects, questions are written by people who want to know the answer, but don\u2019t know the answer yet, and the data is collected directly in each language without the use of translation.", + "Added By": "Khalid N.
Elmadani" +} \ No newline at end of file diff --git a/datasets/khawas.json b/datasets/khawas.json new file mode 100644 index 0000000..6315d11 --- /dev/null +++ b/datasets/khawas.json @@ -0,0 +1,36 @@ +{ + "Name": "Khawas", + "Subsets": [], + "HF Link": "nan", + "Link": "https://sourceforge.net/projects/kacst-acptool/", + "License": "unknown", + "Year": 2013, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "crawling", + "Description": "a corpus containing more than two million words and a corpora processing tool that is specifically designed for Arabic", + "Volume": "2,910", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "Multiple Institutions ", + "Derived From": "nan", + "Paper Title": "New Language Resources for Arabic: Corpus Containing More Than Two Million Words and a Corpus Processing Tool", + "Paper Link": "https://ieeexplore.ieee.org/abstract/document/6646005", + "Script": "Arab", + "Tokenized": "No", + "Host": "sourceforge", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "topic classification", + "Venue Title": "IALP", + "Citations": "19.0", + "Venue Type": "conference", + "Venue Name": "International Conference on Asian Language Processing", + "Authors": "A. Al-Thubaity,Marwa Khan,Manal Al-Mazrua,Maram Al-Mousa", + "Affiliations": ",,,", + "Abstract": "Arabic is a resource-poor language relative to other languages with a similar number of speakers. This situation negatively affects corpus-based linguistic studies in Arabic and, to a lesser extent, Arabic language processing. This paper presents a brief overview of recent freely available Arabic corpora and corpora processing tools, and it examines some of the issues that may be preventing Arabic linguists from using the same. These issues reveal the need for new language resources to enrich and foster Arabic corpus-based studies. Accordingly, this paper introduces the design of a new Arabic corpus that includes modern standard Arabic varieties based on newspapers from all Arab countries and that comprises more than two million words, it also describes the main features of a corpus processing tool specifically designed for Arabic, called \"Khawas U\u0302\u00e6C\u0327O\u0303\" (\"diver\" in English). Khawas provides more features than any other freely available corpus processing tool for Arabic, including n-gram frequency and concordance, collocations, and statistical comparison of two corpora. Finally, we outline modifications and improvements that could be made in future works.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/kind.json b/datasets/kind.json new file mode 100644 index 0000000..27f7cea --- /dev/null +++ b/datasets/kind.json @@ -0,0 +1,36 @@ +{ + "Name": "KIND", + "Subsets": [], + "HF Link": "https://hf.co/datasets/KIND-Dataset/KIND", + "Link": "https://hf.co/datasets/KIND-Dataset/KIND", + "License": "CC BY 4.0", + "Year": 2024, + "Language": "ar", + "Dialect": "mixed", + "Domain": "other", + "Form": "text", + "Collection Style": "manual curation", + "Description": "The KIND dataset consists of fine-grained Arabic dialect data collected through a social collaboration approach, emphasizing community involvement. It includes both aligned parallel corpora (MSA to dialectal translations) and dialectal Q&A data. 
The dataset covers underrepresented Arabic dialects, is suitable for training NLP models, and supports nuanced dialect analysis and comparison with MSA.", + "Volume": "55,484", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "KFUPM", + "Derived From": "MADAR dataset for MSA sentences ", + "Paper Title": "The KIND Dataset: A Social Collaboration Approach for Nuanced Dialect Data Collection", + "Paper Link": "https://aclanthology.org/2024.eacl-srw.3.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "HuggingFace", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "dialect identification, machine translation, question answering", + "Venue Title": "EACL", + "Citations": "nan", + "Venue Type": "conference", + "Venue Name": "The 18th Conference of the European Chapter of the Association for Computational Linguistics", + "Authors": "Asma Z. Yamani, Raghad Alziyady, Reem AlYami, Salma A. Albelali, Leina Abouhagar, Jawharah Almulhim, Amjad Alsulami, Motaz Alfarraj, Rabeah Al-Zaidy", + "Affiliations": "King Fahd University of Petroleum & Minerals, Saudi Arabia.", + "Abstract": "The KIND dataset offers a nuanced collection of Arabic dialect data, emphasizing under-represented dialects and cultural relevance. It includes both aligned parallel texts for translation between MSA and dialects, and Q&A data. The dataset was curated through social collaboration and gamification, promoting community participation and offering innovative data collection methods.", + "Added By": "Maryam Al Emadi" +} \ No newline at end of file diff --git a/datasets/ksaa-rd_dataset.json b/datasets/ksaa-rd_dataset.json new file mode 100644 index 0000000..1c4ecc4 --- /dev/null +++ b/datasets/ksaa-rd_dataset.json @@ -0,0 +1,36 @@ +{ + "Name": "KSAA-RD Dataset", + "Subsets": [], + "HF Link": "nan", + "Link": "https://github.com/StephenETaylor/KSAA-RD", + "License": "unknown", + "Year": 2023, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "crawling and annotation(translation)", + "Description": " The KSAA-RD dataset contains over 58,000 Arabic entries and 63,000 English entries. Each entry consists of a word (lemma), its part of speech, and a gloss (definition). 
The dataset provides both contextualized word embeddings (from AraELECTRA) and fixed word embeddings (from AraVec\u2019s skip-gram model).", + "Volume": "58,000", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "King Salman Global Academy for Arabic Language (KSAA)", + "Derived From": "Contemporary Arabic Language dictionary (Arabic), SemEval 2022 English dictionary", + "Paper Title": "KSAA-RD Shared Task: Arabic Reverse Dictionary", + "Paper Link": "https://aclanthology.org/2023.arabicnlp-1.39.pdf", + "Script": "Arab-Latn", + "Tokenized": "Yes", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "cross-lingual information retrieval, reverse dictionary generation", + "Venue Title": "ArabicNLP", + "Citations": "nan", + "Venue Type": "conference", + "Venue Name": "First Arabic Natural Language Processing Conference", + "Authors": "Rawan Al-Matham, Waad Alshammari, Abdulrahman AlOsaimy, Sarah Alhumoud, Asma Al Wazrah, Afrah Altamimi, Halah Alharbi, Abdullah Alfaifi", + "Affiliations": "King Salman Global Academy for Arabic Language (KSAA)", + "Abstract": "The KSAA-RD shared task involves developing a reverse dictionary system for Arabic, with two subtasks focusing on Arabic-to-Arabic and cross-lingual (English-to-Arabic) reverse dictionary tasks. The dataset includes glosses and word embeddings, and the task aimed to improve word embedding models for reverse dictionary generation. Teams participated to predict the most accurate word embeddings from glosses using neural language models like BERT.", + "Added By": "Maryam Al Emadi" +} \ No newline at end of file diff --git a/datasets/ksu_rich_arabic_speech_database.json b/datasets/ksu_rich_arabic_speech_database.json new file mode 100644 index 0000000..13871c0 --- /dev/null +++ b/datasets/ksu_rich_arabic_speech_database.json @@ -0,0 +1,36 @@ +{ + "Name": "KSU Rich Arabic Speech Database", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2014S02", + "License": "custom", + "Year": 2013, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "transcribed audio", + "Form": "spoken", + "Collection Style": "crawling and annotation(other)", + "Description": "It has 752 speakers from different ethnic groups: Saudis, Arabs, and non-Arabs.", + "Volume": "590", + "Unit": "hours", + "Ethical Risks": "Low", + "Provider": "King Saud University", + "Derived From": "nan", + "Paper Title": "KSU Rich Arabic Speech Database", + "Paper Link": "https://catalog.ldc.upenn.edu/docs/LDC2014S02/KSU-Rich-Arabic-Speech-Database.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "2,000.00 $", + "Test Split": "No", + "Tasks": "speech recognition", + "Venue Title": "other", + "Citations": "20.0", + "Venue Type": "preprint", + "Venue Name": "nan", + "Authors": "M. Alsulaiman,G. Muhammad,M. Bencherif,A. Mahmood,Z. Ali", + "Affiliations": ",,,,", + "Abstract": "Arabic is one of the major languages in the world. Unfortunately not so much research in Arabic speaker recognition has been done. One main reason for this lack of research is the unavailability of rich Arabic speech databases. In this paper, we present a rich and comprehensive Arabic speech database that we developed for the Arabic speaker / speech recognition research and/or applications.
The database is rich in different aspects: (a) it has 752 speakers; (b) the speakers are from different ethnic groups: Saudis, Arabs, and non-Arabs; (c) utterances are both read text and spontaneous; (d) scripts are of different dimensions, such as, isolated words, digits, phonetically rich words, sentences, phonetically balanced sentences, paragraphs, etc.; (e) different sets of microphones with medium and high quality; (f) telephony and non-telephony speech; (g) three different recording environments: office, sound proof room, and cafeteria; (h) three different sessions, where the recording sessions are scheduled at least with 2 weeks interval. Because of the richness of this database, it can be used in many Arabic, and non-Arabic, speech processing researches, such as speaker / speech recognition, speech analysis, accent identification, ethnic groups / nationality recognition, etc. The richness of the database makes it a valuable resource for research in Arabic speech processing in particular and for research in speech processing in general. The database was carefully manually verified. The manual verification was complemented with automatic verification. Validation was performed on a subset of the database where the recognition rate reached 100% for Saudi speakers and 96% for non-Saudi speakers by using a system with 12 Mel frequency Cepstral coefficients, and 32 Gaussian mixtures.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/ksucca_corpus.json b/datasets/ksucca_corpus.json new file mode 100644 index 0000000..76e727c --- /dev/null +++ b/datasets/ksucca_corpus.json @@ -0,0 +1,36 @@ +{ + "Name": "KSUCCA Corpus", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/KSUCCA", + "Link": "https://sourceforge.net/projects/ksucca-corpus/", + "License": "unknown", + "Year": 2013, + "Language": "ar", + "Dialect": "ar-CLS: (Arabic (Classic))", + "Domain": "books", + "Form": "text", + "Collection Style": "crawling", + "Description": "KSUCCA is a pioneering 50+ million word corpus that captures the culture of a nation.", + "Volume": "410", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "King Saud University", + "Derived From": "nan", + "Paper Title": "The Design and Construction of the 50 Million Words KSUCCA ", + "Paper Link": "https://eprints.whiterose.ac.uk/81860/1/TheDesignConstruction50MillionWordKSUCCA.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "sourceforge", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "semantic similarity", + "Venue Title": "other", + "Citations": "30.0", + "Venue Type": "preprint", + "Venue Name": "nan", + "Authors": "Maha AlRabiah,A. Al-Salman,E. Atwell", + "Affiliations": ",,", + "Abstract": "In this paper, we report the design and construction of King Saud University Corpus of Classical Arabic (KSUCCA), which is part of ongoing research that attempts to study the meanings of words used in the holy Quran, through analysis of their distributional semantics in contemporaneous texts. The holy Quranic text was revealed in pure Classical Arabic, which forms the basis of Arabic linguistic theory and which is well understood by the educated Arabic reader. Therefore, it is necessary to investigate the distributional lexical semantics of the Quran's words in the light of similar texts (corpus) that are written in pure Classical Arabic. 
To the best of our knowledge, there exist only two corpora of Classical Arabic; one is part of the King Abdulaziz City for Science and Technology Arabic Corpus (KACST Arabic Corpus) and the other is the Classical Arabic Corpus (CAC) (Elewa, 2009). However, neither of the two corpora is adequate for our research; the former does not cover many genres such as: Linguistics, Literature, Science, Sociology and Biography; and it only contains 17+ million words, so it is not very large. While the latter is even smaller with only 5 million words. Therefore, we made an effort to carefully design and compose our own corpus bearing in mind that it should be large enough, balanced, and representative so that any result obtained from it can be generalized for Classical Arabic. In addition, we tried to make the design general enough in order to make the corpus also appropriate for other research.", + "Added By": "Mustafa Ghaleb" +} \ No newline at end of file diff --git a/datasets/ksuemotions.json b/datasets/ksuemotions.json new file mode 100644 index 0000000..2692990 --- /dev/null +++ b/datasets/ksuemotions.json @@ -0,0 +1,36 @@ +{ + "Name": "KSUEmotions", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2017S12", + "License": "LDC User Agreement for Non-Members", + "Year": 2017, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "transcribed audio", + "Form": "spoken", + "Collection Style": "other", + "Description": "Audio was recorded in each participant's home. Audio is presented as 16-bit 16 kHz flac compressed wav. In addition to speech files and metadata about the speakers, timeless label files and automatic time segmentation alignment files are included. Text is presented as UTF-8 plain text.", + "Volume": "5", + "Unit": "hours", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "1,000.00 $", + "Test Split": "No", + "Tasks": "prosody,speech recognition", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/kunuz.json b/datasets/kunuz.json new file mode 100644 index 0000000..6ec195d --- /dev/null +++ b/datasets/kunuz.json @@ -0,0 +1,36 @@ +{ + "Name": "KUNUZ", + "Subsets": [], + "HF Link": "nan", + "Link": "http://jarir.tn/kunuzcorpus", + "License": "CC BY-NC-ND 4.0", + "Year": 2019, + "Language": "multilingual", + "Dialect": "ar-CLS: (Arabic (Classic))", + "Domain": "other", + "Form": "text", + "Collection Style": "manual curation", + "Description": "KUNUZ is an XMLized version of Sahih Albukhari, the most authentic hadith book.", + "Volume": "7,563", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "Joint group for Artificial Reasoning and Information Retrieval", + "Derived From": "nan", + "Paper Title": "KUNUZ: a Multi-purpose Reusable Test Collection for Classical Arabic Document Engineering", + "Paper Link": "https://ieeexplore.ieee.org/document/9035212", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "machine translation, cross-lingual information retrieval, named entity recognition, information retrieval, language identification, document classification, information extraction", + 
"Venue Title": "AICCSA", + "Citations": "2.0", + "Venue Type": "conference", + "Venue Name": "International Conference on Computer Systems and Applications", + "Authors": "Ibrahim Bounhas, Souheila Ben Guirat", + "Affiliations": "LISI: Laboratory of Computer science for industrial systems, Carthage University, Tunisia", + "Abstract": "Corpora are important resources for several applications in Information Retrieval (IR) and Knowledge Extraction (KE). Arabic is a low resourced language characterized by its complex morphology. Furthermore, most existent Arabic language resources focus on Modern Standard Arabic (MSA). This paper describes KUNUZ a multi-purpose test collection composed of voweled and structured classical Arabic documents. Its goal is to provide a unique benchmark for assessing applications in several areas of document engineering including IR, document classification and information extraction. The documents are also translated in English to allow Arabic-English cross-lingual IR and machine translation. As far as IR is concerned, we follow the standard topic development and results sampling used in international campaigns. The paper, describes the process of topic development, results pooling and relevance judgment. It also analyses the results of some processing tools and IR models used in the runs. In order to enhance the results of our experiments, we also proposed to combine the results based on a meta-search approach using Support Vector Machines (SVM) classification.", + "Added By": "Jezia Zakraoui" +} \ No newline at end of file diff --git a/datasets/l-hsab.json b/datasets/l-hsab.json new file mode 100644 index 0000000..0a21e43 --- /dev/null +++ b/datasets/l-hsab.json @@ -0,0 +1,36 @@ +{ + "Name": "L-HSAB", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/L_HSAB", + "Link": "https://github.com/Hala-Mulki/L-HSAB-First-Arabic-Levantine-HateSpeech-Dataset", + "License": "unknown", + "Year": 2019, + "Language": "ar", + "Dialect": "ar-LEV: (Arabic (Levant))", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "Arabic Levantine Hate Speech and Abusive Language Dataset", + "Volume": "5,851", + "Unit": "sentences", + "Ethical Risks": "High", + "Provider": "Multiple Institutions", + "Derived From": "nan", + "Paper Title": "L-HSAB: A Levantine Twitter Dataset for Hate Speech and Abusive Language", + "Paper Link": "https://aclanthology.org/W19-3512.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "hate speech detection, abusive language detection", + "Venue Title": "ALW", + "Citations": "46.0", + "Venue Type": "workshop", + "Venue Name": "Abusive Language Online", + "Authors": "Hala Mulki,Hatem Haddad,Chedi Bechikh Ali,Halima Alshabani", + "Affiliations": ",iCompass,,", + "Abstract": "\u2217Department of Computer Engineering, Konya Technical University, Turkey \u2020RIADI Laboratory, National School of Computer Sciences, University of Manouba, Tunisia \u2217\u2217LISI Laboratory, INSAT, Carthage University, Tunisia \u2217\u2217\u2217Department of Computer Engineering, K\u0131r\u0131kkale University, Turkey \u00a7iCompass Consulting, Tunisia halamulki@selcuk.edu.tr,haddad.Hatem@gmail.com chedi.bechikh@gmail.com,halima.alshabani@gmail.com Abstract", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/labr.json b/datasets/labr.json new file mode 100644 index 0000000..9561df9 --- /dev/null +++ 
b/datasets/labr.json @@ -0,0 +1,36 @@ +{ + "Name": "LABR", + "Subsets": [], + "HF Link": "https://hf.co/datasets/mohamedadaly/labr", + "Link": "https://github.com/mohamedadaly/LABR", + "License": "GPL-2.0", + "Year": 2013, + "Language": "ar", + "Dialect": "mixed", + "Domain": "reviews", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "The largest sentiment analysis dataset to-date for the Arabic language.", + "Volume": "63,257", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "Cairo University", + "Derived From": "nan", + "Paper Title": "LABR: A Large Scale Arabic Book Reviews Dataset", + "Paper Link": "https://aclanthology.org/P13-2088.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "sentiment analysis", + "Venue Title": "ACL", + "Citations": "165.0", + "Venue Type": "conference", + "Venue Name": "Association for Computational Linguistics", + "Authors": "Mohamed A. Aly,A. Atiya", + "Affiliations": ",", + "Abstract": "We introduce LABR, the largest sentiment analysis dataset to-date for the Arabic language. It consists of over 63,000 book reviews, each rated on a scale of 1 to 5 stars. We investigate the properties of the dataset, and present its statistics. We explore using the dataset for two tasks: sentiment polarity classification and rating classification. We provide standard splits of the dataset into training and testing, for both polarity and rating classification, in both balanced and unbalanced settings. We run baseline experiments on the dataset to establish a benchmark.", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/lama.json b/datasets/lama.json new file mode 100644 index 0000000..76677e9 --- /dev/null +++ b/datasets/lama.json @@ -0,0 +1,36 @@ +{ + "Name": "LAMA", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/ara_emotion", + "Link": "https://github.com/UBC-NLP/ara_emotion_naacl2018", + "License": "unknown", + "Year": 2018, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(translation)", + "Description": "A dataset for Modern Standard and Dialectal Arabic emotion detection focused at Robert Plutchik\u2019s 8 basic emotion types", + "Volume": "7,000", + "Unit": "sentences", + "Ethical Risks": "Medium", + "Provider": "University of British Columbia", + "Derived From": "nan", + "Paper Title": "Enabling Deep Learning of Emotion With First-Person Seed Expressions", + "Paper Link": "https://aclanthology.org/W18-1104.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "emotion detection", + "Venue Title": "ACL", + "Citations": "21.0", + "Venue Type": "workshop", + "Venue Name": "The Second Workshop on Computational Modeling of People\u2019s Opinions, Personality, and Emotions in Social Media, ACL", + "Authors": "Hassan Alhuzali, Muhammad Abdul-Mageed, Lyle Ungar", + "Affiliations": "nan", + "Abstract": "The computational treatment of emotion in natural language text remains relatively limited, and Arabic is no exception. This is partly due to lack of labeled data.
In this work, we describe and manually validate a method for the automatic acquisition of emotion labeled data and introduce a newly developed data set for Modern Standard and Dialectal Arabic emotion detection focused at Robert Plutchik\u2019s 8 basic emotion types. Using a hybrid supervision method that exploits first person emotion seeds, we show how we can acquire promising results with a deep gated recurrent neural network. Our best model reaches 70% F-score, significantly (i.e., 11%, p < 0.05) outperforming a competitive baseline. Applying our method and data on an external dataset of 4 emotions released around the same time we finalized our work, we acquire 7% absolute gain in F-score over a linear SVM classifier trained on gold data, thus validating our approach.", + "Added By": "Emad A. Alghamdi" +} \ No newline at end of file diff --git a/datasets/language_identification.json b/datasets/language_identification.json new file mode 100644 index 0000000..71837ea --- /dev/null +++ b/datasets/language_identification.json @@ -0,0 +1,36 @@ +{ + "Name": "language identification", + "Subsets": [], + "HF Link": "https://hf.co/datasets/papluca/language-identification", + "Link": "https://hf.co/datasets/papluca/language-identification", + "License": "unknown", + "Year": 2022, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "The Language Identification dataset is a collection of 90k samples consisting of text passages and the corresponding language label.", + "Volume": "3,500", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "nan", + "Derived From": "XNLI", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "HuggingFace", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "language identification", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/language_understanding_annotation_corpus.json b/datasets/language_understanding_annotation_corpus.json new file mode 100644 index 0000000..01949d3 --- /dev/null +++ b/datasets/language_understanding_annotation_corpus.json @@ -0,0 +1,36 @@ +{ + "Name": "Language Understanding Annotation Corpus", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2009T10", + "License": "LDC User Agreement for Non-Members", + "Year": 2009, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "transcribed audio", + "Form": "text", + "Collection Style": "other", + "Description": "The resulting corpus contains over 9000 words of English text (6949 words) and Arabic text (2183 words) annotated for committed belief, event and entity coreference, dialog acts and temporal relations.", + "Volume": "2,183", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "LDC", + "Access": "Upon-Request", + "Cost": "nan", + "Test Split": "No", + "Tasks": "pragmatics", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of
file diff --git a/datasets/large_multi-domain_resources_for_arabic_sentiment_analysis.json b/datasets/large_multi-domain_resources_for_arabic_sentiment_analysis.json new file mode 100644 index 0000000..4009e28 --- /dev/null +++ b/datasets/large_multi-domain_resources_for_arabic_sentiment_analysis.json @@ -0,0 +1,36 @@ +{ + "Name": "Large Multi-Domain Resources for Arabic Sentiment Analysis", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/ATT", + "Link": "https://github.com/hadyelsahar/large-arabic-sentiment-analysis-resouces", + "License": "unknown", + "Year": 2015, + "Language": "ar", + "Dialect": "mixed", + "Domain": "reviews", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "Large Multi-Domain Resources for Arabic Sentiment Analysis", + "Volume": "45,498", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "Nile University", + "Derived From": "nan", + "Paper Title": "Building Large Arabic Multi-domain Resources for Sentiment Analysis", + "Paper Link": "https://link.springer.com/chapter/10.1007/978-3-319-18117-2_2", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "sentiment analysis", + "Venue Title": "CICLing", + "Citations": "127.0", + "Venue Type": "conference", + "Venue Name": "International Conference on Computational Linguistics and Intelligent Text Processing", + "Authors": "Hady ElSahar,S. El-Beltagy", + "Affiliations": ",", + "Abstract": "While there has been recent progress in the area of Arabic Sentiment Analysis, most of the resources in this area are either of limited size, domain specific or not publicly available. In this paper, we address this problem by generating large multi-domain datasets for Sentiment Analysis in Arabic. The datasets were scraped from different reviewing websites and consist of a total of 33K annotated reviews for movies, hotels, restaurants and products. Moreover we build multi-domain lexicons from the generated datasets. Different experiments have been carried out to validate the usefulness of the datasets and the generated lexicons for the task of sentiment classification. From the experimental results, we highlight some useful insights addressing: the best performing classifiers and feature representation methods, the effect of introducing lexicon based features and factors affecting the accuracy of sentiment classification in general.
All the datasets, experiments code and results have been made publicly available for scientific purposes.", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/laser.json b/datasets/laser.json new file mode 100644 index 0000000..7225a2c --- /dev/null +++ b/datasets/laser.json @@ -0,0 +1,36 @@ +{ + "Name": "LASER", + "Subsets": [], + "HF Link": "nan", + "Link": "https://github.com/facebookresearch/LASER", + "License": "BSD", + "Year": 2019, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "human translation", + "Description": "aligned sentences in 112 languages", + "Volume": "1,000", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "Multiple Institutions", + "Derived From": "Tatoeba", + "Paper Title": "Massively Multilingual Sentence Embeddings for Zero-Shot Cross-Lingual Transfer and Beyond", + "Paper Link": "https://arxiv.org/pdf/1812.10464.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "machine translation", + "Venue Title": "TACL", + "Citations": "374.0", + "Venue Type": "journal", + "Venue Name": "Transactions of the Association for Computational Linguistics", + "Authors": "Mikel Artetxe,Holger Schwenk", + "Affiliations": ",", + "Abstract": "We introduce an architecture to learn joint multilingual sentence representations for 93 languages, belonging to more than 30 different families and written in 28 different scripts. Our system uses a single BiLSTM encoder with a shared byte-pair encoding vocabulary for all languages, which is coupled with an auxiliary decoder and trained on publicly available parallel corpora. This enables us to learn a classifier on top of the resulting embeddings using English annotated data only, and transfer it to any of the 93 languages without any modification. Our experiments in cross-lingual natural language inference (XNLI data set), cross-lingual document classification (MLDoc data set), and parallel corpus mining (BUCC data set) show the effectiveness of our approach. We also introduce a new test set of aligned sentences in 112 languages, and show that our sentence embeddings obtain strong results in multilingual similarity search even for low-resource languages. Our implementation, the pre-trained encoder, and the multilingual test set are available at https://github.com/facebookresearch/LASER.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/lc-star__standard_arabic_phonetic_lexicon.json b/datasets/lc-star__standard_arabic_phonetic_lexicon.json new file mode 100644 index 0000000..12c2f66 --- /dev/null +++ b/datasets/lc-star__standard_arabic_phonetic_lexicon.json @@ -0,0 +1,36 @@ +{ + "Name": "LC-STAR: Standard Arabic Phonetic lexicon", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalogue.elra.info/en-us/repository/browse/ELRA-S0247/", + "License": "Non Commercial Use - ELRA END USER", + "Year": 2007, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "The lexicon comprises 110,271 entries, distributed over three categories: - a set of 52,981 common word entries.
This set is extracted from a corpus of more than 13 million words distributed over 6 different domains (sports/games, news, finance, culture/entertainment, consumer information, personal communications). This was done with the aim of reaching a target for each domain of at least 95% self-coverage. In addition to extracting word lists from the corpus, a list of closed set (function) word classes is included in the final word list. - a set of 50,135 proper names (including person names, family names, cities, streets, companies and brand names) divided into 3 domains. Multiple word names such as New_York are kept together in all three domains, and they count as one entry. The 3 domains consist of first and last names (9,738 different entries), place names (22,998 different entries), and organisations (17,309 different entries). - and a list of 7,155 special application words translated from English terms defined by the LC-STAR consortium. This list contains: numbers, letters, abbreviations and specific vocabulary for applications controlled by voice (information retrieval, controlling of consumer devices, etc.).", + "Volume": "110,271", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "European Commission", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "ELRA", + "Access": "With-Fee", + "Cost": "27,625.00\u20ac", + "Test Split": "No", + "Tasks": "machine translation, speech recognition, lexicon analysis", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/ldc_standard_arabic_morphological_analyzer_(sama)_version_3_1.json b/datasets/ldc_standard_arabic_morphological_analyzer_(sama)_version_3_1.json new file mode 100644 index 0000000..079b88a --- /dev/null +++ b/datasets/ldc_standard_arabic_morphological_analyzer_(sama)_version_3_1.json @@ -0,0 +1,36 @@ +{ + "Name": "LDC Standard Arabic Morphological Analyzer (SAMA) Version 3.1", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2010L01", + "License": "LDC User Agreement for Non-Members", + "Year": 2010, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "The input format, output format, and data layer of SAMA 3.1 were designed to be backward compatible with BAMA.
Incremental changes were made to the data layer in SAMA.", + "Volume": "40,654", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "N/A $", + "Test Split": "No", + "Tasks": "machine translation,information retrieval", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/le_monde_diplomatique__arabic_tagged_corpus.json b/datasets/le_monde_diplomatique__arabic_tagged_corpus.json new file mode 100644 index 0000000..e6eadaa --- /dev/null +++ b/datasets/le_monde_diplomatique__arabic_tagged_corpus.json @@ -0,0 +1,36 @@ +{ + "Name": "Le Monde Diplomatique: Arabic tagged corpus", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalogue.elra.info/en-us/repository/browse/ELRA-W0049/", + "License": "Non Commercial Use - ELRA END USER", + "Year": 2009, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "This corpus contains 102,960 vowelized, lemmatized and tagged words.", + "Volume": "102,960", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "Yes", + "Host": "ELRA", + "Access": "With-Fee", + "Cost": "400.00\u20ac", + "Test Split": "No", + "Tasks": "grammatical analysis, morphological analysis", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/lebanon_uprising_arabic_tweets.json b/datasets/lebanon_uprising_arabic_tweets.json new file mode 100644 index 0000000..a4ebd65 --- /dev/null +++ b/datasets/lebanon_uprising_arabic_tweets.json @@ -0,0 +1,36 @@ +{ + "Name": "Lebanon Uprising Arabic Tweets", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/Lebanon_Uprising_Arabic_Tweets", + "Link": "https://www.kaggle.com/datasets/abedkhooli/lebanon-uprising-october-2019-tweets", + "License": "unknown", + "Year": 2019, + "Language": "multilingual", + "Dialect": "mixed", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling", + "Description": "This is a collection of tweets related to the Arabic hashtag (#\u0644\u0628\u0646\u0627\u0646_\u064a\u0646\u062a\u0641\u0636) on the Lebanon uprising in October 2019.", + "Volume": "100,000", + "Unit": "sentences", + "Ethical Risks": "High", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab-Latn", + "Tokenized": "No", + "Host": "kaggle", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "sentiment analysis", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "Abed Khooli", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Mustafa Ghaleb" +} \ No newline at end of file diff --git a/datasets/let-mi.json b/datasets/let-mi.json new file mode 100644 index 0000000..d200954 --- /dev/null +++ b/datasets/let-mi.json @@ -0,0 +1,36 @@ +{ + "Name": "Let-mi", + "Subsets": [], + "HF Link": "nan", + "Link":
"https://github.com/bilalghanem/let-mi ", + "License": "unknown", + "Year": 2021, + "Language": "ar", + "Dialect": "ar-LEV: (Arabic (Levant))", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "Levantine Twitter dataset for Misogynistic language", + "Volume": "6,603", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "Multiple Institutions ", + "Derived From": "nan", + "Paper Title": "Let-Mi: An Arabic Levantine Twitter Dataset for Misogynistic Language\r", + "Paper Link": "https://arxiv.org/pdf/2103.10195.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Upon-Request", + "Cost": "nan", + "Test Split": "No", + "Tasks": "misogyny identification", + "Venue Title": "WANLP", + "Citations": "2.0", + "Venue Type": "workshop", + "Venue Name": "Arabic Natural Language Processing Workshop", + "Authors": "Hala Mulki,Bilal Ghanem", + "Affiliations": ",", + "Abstract": "Online misogyny has become an increasing worry for Arab women who experience gender-based online abuse on a daily basis. Misogyny automatic detection systems can assist in the prohibition of anti-women Arabic toxic content. Developing such systems is hindered by the lack of the Arabic misogyny benchmark datasets. In this paper, we introduce an Arabic Levantine Twitter dataset for Misogynistic language (LeT-Mi) to be the first benchmark dataset for Arabic misogyny. We further provide a detailed review of the dataset creation and annotation phases. The consistency of the annotations for the proposed dataset was emphasized through inter-rater agreement evaluation measures. Moreover, Let-Mi was used as an evaluation dataset through binary/multi-/target classification tasks conducted by several state-of-the-art machine learning systems along with Multi-Task Learning (MTL) configuration. The obtained results indicated that the performances achieved by the used systems are consistent with state-of-the-art results for languages other than Arabic, while employing MTL improved the performance of the misogyny/target classification tasks.", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/levantine_arabic_conversational_telephone_speech,_transcripts.json b/datasets/levantine_arabic_conversational_telephone_speech,_transcripts.json new file mode 100644 index 0000000..83edbb3 --- /dev/null +++ b/datasets/levantine_arabic_conversational_telephone_speech,_transcripts.json @@ -0,0 +1,36 @@ +{ + "Name": "Levantine Arabic Conversational Telephone Speech, Transcripts", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2007T01", + "License": "LDC User Agreement for Non-Members", + "Year": 2007, + "Language": "ar", + "Dialect": "ar-LEV: (Arabic (Levant))", + "Domain": "transcribed audio", + "Form": "spoken", + "Collection Style": "other", + "Description": "This database contains 982 Levantine Arabic speakers taking part in spontaneous telephone conversations in Colloquial Levantine Arabic. A total of 985 conversation sides are provided (there are three speakers who each appear in two disctinct conversations). 
The average duration per side is between 5 and 6 minutes.", + "Volume": "985", + "Unit": "sentences", + "Ethical Risks": "nan", + "Provider": "Appen Pty Ltd", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "200.00 $", + "Test Split": "No", + "Tasks": "speech recognition", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/levantine_arabic_conversational_telephone_speech.json b/datasets/levantine_arabic_conversational_telephone_speech.json new file mode 100644 index 0000000..743497f --- /dev/null +++ b/datasets/levantine_arabic_conversational_telephone_speech.json @@ -0,0 +1,36 @@ +{ + "Name": "Levantine Arabic Conversational Telephone Speech", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2007S01", + "License": "LDC User Agreement for Non-Members", + "Year": 2007, + "Language": "ar", + "Dialect": "ar-LEV: (Arabic (Levant))", + "Domain": "transcribed audio", + "Form": "spoken", + "Collection Style": "other", + "Description": "Audio for 985 conversation sides of spontaneous telephone conversations in Colloquial Levantine Arabic; an audio sample (wav format) is available on the LDC catalog page.", + "Volume": "985", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "400.00 $", + "Test Split": "No", + "Tasks": "speech recognition", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/levantine_arabic_qt_training_data_set_4_(speech_+_transcripts).json b/datasets/levantine_arabic_qt_training_data_set_4_(speech_+_transcripts).json new file mode 100644 index 0000000..b21e58c --- /dev/null +++ b/datasets/levantine_arabic_qt_training_data_set_4_(speech_+_transcripts).json @@ -0,0 +1,36 @@ +{ + "Name": "Levantine Arabic QT Training Data Set 4 (Speech + Transcripts)", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2005S14", + "License": "LDC User Agreement for Non-Members", + "Year": 2005, + "Language": "ar", + "Dialect": "ar-LEV: (Arabic (Levant))", + "Domain": "transcribed audio", + "Form": "spoken", + "Collection Style": "other", + "Description": "Levantine Arabic QT Training Data Set 4 (Speech + Transcripts) was developed by the Linguistic Data Consortium (LDC) and contains approximately 138 hours of conversational telephone speech in Levantine Arabic and the associated transcripts.", + "Volume": "138", + "Unit": "hours", + "Ethical Risks": "nan", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "3,000.00 $", + "Test Split": "No", + "Tasks": "speech recognition, text to speech", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "Mohamed Maamouri, Tim Buckwalter, Hubert Jin", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Kamel GAANOUN" +} \ No newline at end of file diff --git a/datasets/levantine_arabic_qt_training_data_set_5,_speech.json
b/datasets/levantine_arabic_qt_training_data_set_5,_speech.json new file mode 100644 index 0000000..9b0c2a4 --- /dev/null +++ b/datasets/levantine_arabic_qt_training_data_set_5,_speech.json @@ -0,0 +1,36 @@ +{ + "Name": "Levantine Arabic QT Training Data Set 5, Speech", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2006S29", + "License": "LDC User Agreement for Non-Members", + "Year": 2006, + "Language": "ar", + "Dialect": "ar-LEV: (Arabic (Levant))", + "Domain": "transcribed audio", + "Form": "spoken", + "Collection Style": "other", + "Description": "Levantine Arabic QT Training Data Set 5, Speech was developed by the Linguistic Data Consortium (LDC) and contains 1,660 calls totalling approximately 250 hours of telephone conversation in Levantine Arabic. These calls were collected between 2003 and 2005.", + "Volume": "250", + "Unit": "hours", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "4,000.00 $", + "Test Split": "No", + "Tasks": "speech recognition", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/levantine_arabic_qt_training_data_set_5,_transcripts.json b/datasets/levantine_arabic_qt_training_data_set_5,_transcripts.json new file mode 100644 index 0000000..98b9b40 --- /dev/null +++ b/datasets/levantine_arabic_qt_training_data_set_5,_transcripts.json @@ -0,0 +1,36 @@ +{ + "Name": "Levantine Arabic QT Training Data Set 5, Transcripts", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2006T07", + "License": "LDC User Agreement for Non-Members", + "Year": 2006, + "Language": "ar", + "Dialect": "ar-LEV: (Arabic (Levant))", + "Domain": "transcribed audio", + "Form": "text", + "Collection Style": "other", + "Description": "This corpus is the combination of four former training data sets: LDC2004E21 and LDC2004E22, LDC2004E65 and LDC2004E66, Arabic CTS Levantine Fisher Training Data Set 3, Speech (LDC2005S07) and Arabic CTS Levantine Fisher Training Data Set 3, Transcripts (LDC2005T03), and Levantine Arabic QT Training Data Set 4 (Speech + Transcripts) (LDC2005S14). More than half of the speakers are Lebanese; the others are Jordanian, Palestinian, and Syrian.
", + "Volume": "250", + "Unit": "hours", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "1,000.00 $", + "Test Split": "No", + "Tasks": "language modeling,language teaching,machine translation", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/lince_-_msa-da__(lid_-_code_switching_).json b/datasets/lince_-_msa-da__(lid_-_code_switching_).json new file mode 100644 index 0000000..ba78d8d --- /dev/null +++ b/datasets/lince_-_msa-da__(lid_-_code_switching_).json @@ -0,0 +1,36 @@ +{ + "Name": "LinCE - MSA-DA (LID - Code Switching )", + "Subsets": [], + "HF Link": "nan", + "Link": "https://ritual.uh.edu/lince/datasets", + "License": "unknown", + "Year": 2016, + "Language": "ar", + "Dialect": "mixed", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "Twitter data for token-level language identification in code-switched Modern Standard Arabic and Dialectal Arabic (MSA-DA).", + "Volume": "11,241", + "Unit": "sentences", + "Ethical Risks": "Medium", + "Provider": "Multiple Institutions", + "Derived From": "nan", + "Paper Title": "Overview for the Second Shared Task on Language Identification in Code-Switched Data", + "Paper Link": "https://aclanthology.org/W16-5805.pdf", + "Script": "Arab-Latn", + "Tokenized": "No", + "Host": "other", + "Access": "Upon-Request", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "codeswitch detection", + "Venue Title": "CALCS", + "Citations": "98.0", + "Venue Type": "workshop", + "Venue Name": "Workshop on Computational Approaches to Code Switching", + "Authors": "Giovanni Molina,F. Alghamdi,Mahmoud A. Ghoneim,A. Hawwari,Nicolas Rey-Villamizar,Mona T. Diab,T. Solorio", + "Affiliations": ",,,,,,", + "Abstract": "We present an overview of the second shared task on language identification on code-switched data. The shared task included code-switched data from four language pairs: Modern Standard Arabic-Dialectal Arabic (MSA-DA), Mandarin-English (MAN-EN), Nepali-English (NEP-EN), and Spanish-English (SPA-EN). A total of seven teams participated in the task and submitted 42 system runs. The evaluation showed that language identification at the token level is more difficult when the languages present are closely related, as in the case of MSA-DA, where the prediction performance was the lowest among all language pairs. In contrast, the language pairs with the highest F-measure were SPA-EN and NEP-EN.
The task made evident that language identification in code-switched data is still far from solved and warrants further research.", + "Added By": "Maraim Masoud" +} \ No newline at end of file diff --git a/datasets/lince_-_msa-egy_(ner_-_code_switching).json b/datasets/lince_-_msa-egy_(ner_-_code_switching).json new file mode 100644 index 0000000..3fb4b36 --- /dev/null +++ b/datasets/lince_-_msa-egy_(ner_-_code_switching).json @@ -0,0 +1,36 @@ +{ + "Name": "LinCE - MSA-EGY (NER - Code Switching)", + "Subsets": [], + "HF Link": "nan", + "Link": "https://ritual.uh.edu/lince/datasets", + "License": "unknown", + "Year": 2018, + "Language": "ar", + "Dialect": "mixed", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "Twitter data and 9 entity types establishing a dataset for code-switched NER benchmarks on the Modern Standard Arabic-Egyptian Arabic (MSA-EGY) language pair.", + "Volume": "11,224", + "Unit": "sentences", + "Ethical Risks": "Medium", + "Provider": "Multiple Institutions", + "Derived From": "nan", + "Paper Title": "Named Entity Recognition on Code-Switched Data: Overview of the CALCS 2018 Shared Task", + "Paper Link": "https://aclanthology.org/W18-3219.pdf", + "Script": "Arab-Latn", + "Tokenized": "No", + "Host": "other", + "Access": "Upon-Request", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "named entity recognition", + "Venue Title": "CALCS", + "Citations": "39.0", + "Venue Type": "workshop", + "Venue Name": "Workshop on Computational Approaches to Code Switching", + "Authors": "Gustavo Aguilar,F. Alghamdi,V\u00edctor Soto,Mona T. Diab,Julia Hirschberg,T. Solorio", + "Affiliations": "University of Houston,,,,,", + "Abstract": "In the third shared task of the Computational Approaches to Linguistic Code-Switching (CALCS) workshop, we focus on Named Entity Recognition (NER) on code-switched social-media data. We divide the shared task into two competitions based on the English-Spanish (ENG-SPA) and Modern Standard Arabic-Egyptian (MSA-EGY) language pairs. We use Twitter data and 9 entity types to establish a new dataset for code-switched NER benchmarks. In addition to the CS phenomenon, the diversity of the entities and the social media challenges make the task considerably hard to process. As a result, the best scores of the competitions are 63.76% and 71.61% for ENG-SPA and MSA-EGY, respectively. We present the scores of 9 participants and discuss the most common challenges among submissions.", + "Added By": "Maraim Masoud" +} \ No newline at end of file diff --git a/datasets/lisan.json b/datasets/lisan.json new file mode 100644 index 0000000..9caa663 --- /dev/null +++ b/datasets/lisan.json @@ -0,0 +1,61 @@ +{ + "Name": "Lisan", + "Subsets": [ + { + "Name": "Iraqi", + "Dialect": "ar-IQ: (Arabic (Iraq))", + "Volume": "50,000", + "Unit": "tokens" + }, + { + "Name": "Libyan", + "Dialect": "ar-LY: (Arabic (Libya))", + "Volume": "52,000", + "Unit": "tokens" + }, + { + "Name": "Sudanese", + "Dialect": "ar-SD: (Arabic (Sudan))", + "Volume": "53,000", + "Unit": "tokens" + }, + { + "Name": "Yemeni", + "Dialect": "ar-YE: (Arabic (Yemen))", + "Volume": "1,050,000", + "Unit": "tokens" + } + ], + "HF Link": "nan", + "Link": "https://sina.birzeit.edu/currasat/about-en.html", + "License": "CC BY 4.0", + "Year": 2023, + "Language": "ar", + "Dialect": "mixed", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "Morphologically-annotated corpora of the Yemeni, Sudanese, Iraqi, and Libyan Arabic dialects. Lisan features around 1.2 million tokens.
The content of the corpora was collected from several social media platforms.", + "Volume": "1,200,000", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "SinaLab, Birzeit University", + "Derived From": "nan", + "Paper Title": "Lisan: Yemeni, Iraqi, Libyan, and Sudanese Arabic Dialect Corpora with Morphological Annotations", + "Paper Link": "https://arxiv.org/abs/2212.06468", + "Script": "Arab", + "Tokenized": "Yes", + "Host": "SinaLab Resources", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "language modeling, dialect identification, Tokenization, part of speech tagging, morphological analysis", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "Mustafa Jarrar, Fadi Zaraket, Tymaa Hammouda, Daanish Masood, Martin Waehlisch", + "Affiliations": "nan", + "Abstract": "This article presents morphologically-annotated Yemeni, Sudanese, Iraqi, and Libyan Arabic dialects Lisan corpora. Lisan features around 1.2 million tokens. We collected the content of the corpora from several social media platforms. The Yemeni corpus (~ 1.05M tokens) was collected automatically from Twitter. The corpora of the other three dialects (~ 50K tokens each) came manually from Facebook and YouTube posts and comments.\nThirty five (35) annotators who are native speakers of the target dialects carried out the annotations. The annotators segmented all words in the four corpora into prefixes, stems and suffixes and labeled each with different morphological features such as part of speech, lemma, and a gloss in English. An Arabic Dialect Annotation Toolkit (ADAT) was developed for the purpose of the annotation. The annotators were trained on a set of guidelines and on how to use ADAT. We developed ADAT to assist the annotators and to ensure compatibility with SAMA and Curras tagsets. The tool is open source, and the four corpora are also available online.", + "Added By": "Tymaa Hammouda" +} \ No newline at end of file diff --git a/datasets/lk-hadith-corpus.json b/datasets/lk-hadith-corpus.json new file mode 100644 index 0000000..58d89a5 --- /dev/null +++ b/datasets/lk-hadith-corpus.json @@ -0,0 +1,36 @@ +{ + "Name": "LK-Hadith-Corpus", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/LK_Hadith", + "Link": "https://github.com/ShathaTm/LK-Hadith-Corpus", + "License": "unknown", + "Year": 2019, + "Language": "multilingual", + "Dialect": "ar-CLS: (Arabic (Classic))", + "Domain": "books", + "Form": "text", + "Collection Style": "other", + "Description": "a bilingual parallel corpus of Islamic Hadith", + "Volume": "39,038", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "King Saud University", + "Derived From": "nan", + "Paper Title": "The Arabic\u2013English Parallel Corpus of Authentic Hadith", + "Paper Link": "https://www.researchgate.net/publication/341359917_The_Arabic-English_Parallel_Corpus_of_Authentic_Hadith", + "Script": "Arab-Latn", + "Tokenized": "Yes", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "machine translation", + "Venue Title": "IJASAT", + "Citations": "nan", + "Venue Type": "journal", + "Venue Name": "International Journal on Islamic Applications in Computer Science And Technologies", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "We present a bilingual parallel corpus of Islamic Hadith, which is the set of narratives reporting different aspects of the prophet Muhammad's life.
The Hadith collection is extracted from the six canonical Hadith books which possess unique linguistic features and patterns that are automatically extracted and annotated using a domain-specific tool for Hadith segmentation. In this article, we present the methodology of creating the corpus of 39,038 annotated Hadiths which will be freely available for the research community.", + "Added By": "Mustafa Ghaleb" +} \ No newline at end of file diff --git a/datasets/madar.json b/datasets/madar.json new file mode 100644 index 0000000..7b19fc9 --- /dev/null +++ b/datasets/madar.json @@ -0,0 +1,36 @@ +{ + "Name": "MADAR", + "Subsets": [], + "HF Link": "nan", + "Link": "https://camel.abudhabi.nyu.edu/madar-parallel-corpus/", + "License": "custom", + "Year": 2018, + "Language": "ar", + "Dialect": "mixed", + "Domain": "other", + "Form": "text", + "Collection Style": "manual curation", + "Description": "a collection of parallel sentences covering the dialects of 25 cities from the Arab World", + "Volume": "14,000", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "NYU Abu Dhabi", + "Derived From": "nan", + "Paper Title": "The MADAR Arabic Dialect Corpus and Lexicon", + "Paper Link": "http://www.lrec-conf.org/proceedings/lrec2018/pdf/351.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "CAMeL Resources", + "Access": "Upon-Request", + "Cost": "nan", + "Test Split": "No", + "Tasks": "dialect identification", + "Venue Title": "LREC", + "Citations": "85.0", + "Venue Type": "conference", + "Venue Name": "International Conference on Language Resources and Evaluation", + "Authors": "Houda Bouamor,Nizar Habash,Mohammad Salameh,W. Zaghouani,Owen Rambow,D. Abdulrahim,Ossama Obeid,Salam Khalifa,Fadhl Eryani,Alexander Erdmann,Kemal Oflazer", + "Affiliations": ",,,,,,,New York University Abu Dhabi,,,", + "Abstract": "In this paper, we present two resources that were created as part of the Multi Arabic Dialect Applications and Resources (MADAR) project. The first is a large parallel corpus of 25 Arabic city dialects in the travel domain. The second is a lexicon of 1,045 concepts with an average of 45 words from 25 cities per concept. These resources are the first of their kind in terms of the breadth of their coverage and the fine location granularity. The focus on cities, as opposed to regions in studying Arabic dialects, opens new avenues to many areas of research from dialectology to dialect identification and machine translation.", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/madar_lexicon.json b/datasets/madar_lexicon.json new file mode 100644 index 0000000..420a9a2 --- /dev/null +++ b/datasets/madar_lexicon.json @@ -0,0 +1,36 @@ +{ + "Name": "MADAR Lexicon", + "Subsets": [], + "HF Link": "nan", + "Link": "https://docs.google.com/forms/d/e/1FAIpQLSe2LHYmHsxdkHPYHgcZDz25dTNbnygPkmClIaLd_fwud-XnTQ/viewform", + "License": "custom", + "Year": 2022, + "Language": "ar", + "Dialect": "mixed", + "Domain": "other", + "Form": "text", + "Collection Style": "manual curation", + "Description": "The MADAR Lexicon is a collection of 1,042 concepts expressed in 25 city dialects totaling 47K entries (with an average of 45 words per concept, or about 2 words per dialect). Concepts were selected from the BTEC Parallel corpora. The lexicon is centered around concept keys, which are triplets of English, French, and Modern Standard Arabic (MSA), and annotators had to provide words that overlap in word sense with all three languages.
Each dialectal word is presented in its CODA orthography and its CAPHI phonology (Bouamor et al., 2018; Habash et al., 2018). The MADAR Lexicon was created as part of the Multi-Arabic Dialect Applications and Resources Project (funded by NPRP 7-290-1-047 from the Qatar National Research Fund, a member of the Qatar Foundation). Website: http://madar.camel-lab.com", + "Volume": "47,000", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "NYU Abu Dhabi", + "Derived From": "nan", + "Paper Title": "The MADAR Arabic Dialect Corpus and Lexicon", + "Paper Link": "http://www.lrec-conf.org/proceedings/lrec2018/pdf/351.pdf", + "Script": "Arab", + "Tokenized": "Yes", + "Host": "CAMeL Resources", + "Access": "Upon-Request", + "Cost": "nan", + "Test Split": "No", + "Tasks": "dialect identification, transliteration", + "Venue Title": "LREC", + "Citations": "127.0", + "Venue Type": "conference", + "Venue Name": "The International Conference on Language Resources and Evaluation", + "Authors": "Houda Bouamor, Nizar Habash, Mohammad Salameh, Wajdi Zaghouani, Owen Rambow, Dana Abdulrahim, Ossama Obeid, Salam Khalifa, Fadhl Eryani, Alexander Erdmann, Kemal Oflazer", + "Affiliations": "Carnegie Mellon University in Qatar, Qatar; Hamad Bin Khalifa University, Qatar; New York University Abu Dhabi, UAE; Columbia University, USA, University of Bahrain; Bahrain", + "Abstract": "In this paper, we present two resources that were created as part of the Multi Arabic Dialect Applications and Resources (MADAR) project. The first is a large parallel corpus of 25 Arabic city dialects in the travel domain. The second is a lexicon of 1,045 concepts with an average of 45 words from 25 cities per concept. These resources are the first of their kind in terms of the breadth of their coverage and the fine location granularity.
The focus on cities, as opposed to regions in studying Arabic dialects, opens new avenues to many areas of research from dialectology to dialect identification and machine translation.", + "Added By": "Fadhl Al-Eryani" +} \ No newline at end of file diff --git a/datasets/madar_twitter_corpus.json b/datasets/madar_twitter_corpus.json new file mode 100644 index 0000000..0205b17 --- /dev/null +++ b/datasets/madar_twitter_corpus.json @@ -0,0 +1,163 @@ +{ + "Name": "MADAR Twitter Corpus", + "Subsets": [ + { + "Name": "Saudi Arabia", + "Dialect": "ar-SA: (Arabic (Saudi Arabia))", + "Volume": "1,070", + "Unit": "sentences" + }, + { + "Name": "Kuwait", + "Dialect": "ar-KW: (Arabic (Kuwait))", + "Volume": "213", + "Unit": "sentences" + }, + { + "Name": "Egypt", + "Dialect": "ar-EG: (Arabic (Egypt))", + "Volume": "173", + "Unit": "sentences" + }, + { + "Name": "UAE", + "Dialect": "ar-AE: (Arabic (United Arab Emirates))", + "Volume": "152", + "Unit": "sentences" + }, + { + "Name": "Oman", + "Dialect": "ar-OM: (Arabic (Oman))", + "Volume": "138", + "Unit": "sentences" + }, + { + "Name": "Yemen", + "Dialect": "ar-YE: (Arabic (Yemen))", + "Volume": "136", + "Unit": "sentences" + }, + { + "Name": "Qatar", + "Dialect": "ar-QA: (Arabic (Qatar))", + "Volume": "126", + "Unit": "sentences" + }, + { + "Name": "Bahrain", + "Dialect": "ar-BH: (Arabic (Bahrain))", + "Volume": "113", + "Unit": "sentences" + }, + { + "Name": "Jordan", + "Dialect": "ar-JO: (Arabic (Jordan))", + "Volume": "107", + "Unit": "sentences" + }, + { + "Name": "Sudan", + "Dialect": "ar-SD: (Arabic (Sudan))", + "Volume": "100", + "Unit": "sentences" + }, + { + "Name": "Iraq", + "Dialect": "ar-IQ: (Arabic (Iraq))", + "Volume": "99", + "Unit": "sentences" + }, + { + "Name": "Algeria", + "Dialect": "ar-DZ: (Arabic (Algeria))", + "Volume": "92", + "Unit": "sentences" + }, + { + "Name": "Libya", + "Dialect": "ar-LY: (Arabic (Libya))", + "Volume": "87", + "Unit": "sentences" + }, + { + "Name": "Palestine", + "Dialect": "ar-PS: (Arabic (Palestine))", + "Volume": "74", + "Unit": "sentences" + }, + { + "Name": "Lebanon", + "Dialect": "ar-LB: (Arabic (Lebanon))", + "Volume": "66", + "Unit": "sentences" + }, + { + "Name": "Somalia", + "Dialect": "ar-SO: (Arabic (Somalia))", + "Volume": "60", + "Unit": "sentences" + }, + { + "Name": "Tunisia", + "Dialect": "ar-TN: (Arabic (Tunisia))", + "Volume": "51", + "Unit": "sentences" + }, + { + "Name": "Syria", + "Dialect": "ar-SY: (Arabic (Syria))", + "Volume": "48", + "Unit": "sentences" + }, + { + "Name": "Morocco", + "Dialect": "ar-MA: (Arabic (Morocco))", + "Volume": "45", + "Unit": "sentences" + }, + { + "Name": "Mauritania", + "Dialect": "ar-MR: (Arabic (Mauritania))", + "Volume": "37", + "Unit": "sentences" + }, + { + "Name": "Djibouti", + "Dialect": "ar-DJ: (Arabic (Djibouti))", + "Volume": "2", + "Unit": "sentences" + } + ], + "HF Link": "nan", + "Link": "https://github.com/CAMeL-Lab/CAMeLBERT", + "License": "unknown", + "Year": 2018, + "Language": "ar", + "Dialect": "mixed", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling", + "Description": "A collection of tweets covering 21 Arab countries, labeled with the country of the user.", + "Volume": "2,980", + "Unit": "sentences", + "Ethical Risks": "Medium", + "Provider": "NYU Abu Dhabi", + "Derived From": "nan", + "Paper Title": "Fine-Grained Arabic Dialect Identification", + "Paper Link": "https://aclanthology.org/C18-1113.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access":
"Upon-Request", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "machine translation, dialect identification", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "conference", + "Venue Name": "International Conference on Computational Linguistics", + "Authors": "Mohammad Salameh, Houda Bouamor, Nizar Habash", + "Affiliations": "Carnegie Mellon University in Qatar;Carnegie Mellon University in Qatar;New York University Abu Dhabi", + "Abstract": "Previous work on the problem of Arabic Dialect Identification typically targeted coarse-grained\r\nfive dialect classes plus Standard Arabic (6-way classification). This paper presents the first\r\nresults on a fine-grained dialect classification task covering 25 specific cities from across the Arab\r\nWorld, in addition to Standard Arabic \u2013 a very challenging task. We build several classification\r\nsystems and explore a large space of features. Our results show that we can identify the exact\r\ncity of a speaker at an accuracy of 67.9% for sentences with an average length of 7 words (a 9%\r\nrelative error reduction over the state-of-the-art technique for Arabic dialect identification) and\r\nreach more than 90% when we consider 16 words. We also report on additional insights from a\r\ndata analysis of similarity and difference across Arabic dialects", + "Added By": "Raed Alharbi" +} \ No newline at end of file diff --git a/datasets/madcat_phase_1_training_set.json b/datasets/madcat_phase_1_training_set.json new file mode 100644 index 0000000..d8cf59f --- /dev/null +++ b/datasets/madcat_phase_1_training_set.json @@ -0,0 +1,36 @@ +{ + "Name": "MADCAT Phase 1 Training Set", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2012T15", + "License": "LDC User Agreement for Non-Members", + "Year": 2012, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "other", + "Description": "This release includes 9,693 annotation files in MADCAT XML format (.madcat.xml) along with their corresponding scanned image files in TIFF format.", + "Volume": "9,693", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "2,000.00 $", + "Test Split": "No", + "Tasks": "machine translation,handwriting recognition", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/madcat_phase_2_training_set.json b/datasets/madcat_phase_2_training_set.json new file mode 100644 index 0000000..07b607a --- /dev/null +++ b/datasets/madcat_phase_2_training_set.json @@ -0,0 +1,36 @@ +{ + "Name": "MADCAT Phase 2 Training Set", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2013T09", + "License": "LDC User Agreement for Non-Members", + "Year": 2013, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "other", + "Description": "This release includes 27,814 annotation files in both GEDI XML and MADCAT XML formats (gedi.xml and madcat.xml) along with their corresponding scanned image files in TIFF format. 
The annotation results in GEDI XML output files include ground truth annotations and source transcripts.", + "Volume": "27,814", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "2,500.00 $", + "Test Split": "No", + "Tasks": "handwriting recognition,machine translation", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/madcat_phase_3_training_set.json b/datasets/madcat_phase_3_training_set.json new file mode 100644 index 0000000..e7af56c --- /dev/null +++ b/datasets/madcat_phase_3_training_set.json @@ -0,0 +1,36 @@ +{ + "Name": "MADCAT Phase 3 Training Set", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2013T15", + "License": "LDC User Agreement for Non-Members", + "Year": 2013, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "other", + "Description": "This release includes 4,540 annotation files in both GEDI XML and MADCAT XML formats (gedi.xml and madcat.xml) along with their corresponding scanned image files in TIFF format. The annotation results in GEDI XML files include ground truth annotations and source transcripts.", + "Volume": "4,540", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "1,000.00 $", + "Test Split": "No", + "Tasks": "handwriting recognition,machine translation", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/maknuune.json b/datasets/maknuune.json new file mode 100644 index 0000000..aaf95ee --- /dev/null +++ b/datasets/maknuune.json @@ -0,0 +1,36 @@ +{ + "Name": "Maknuune", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/Maknuune", + "Link": "http://www.palestine-lexicon.org/", + "License": "CC BY-SA 4.0", + "Year": 2022, + "Language": "ar", + "Dialect": "ar-PS: (Arabic (Palestine))", + "Domain": "other", + "Form": "spoken", + "Collection Style": "manual curation", + "Description": "Palestinian Arabic lexicon collected through manual curation and field surveys.", + "Volume": "36,302", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "Shahd Dibas, NYU Abu Dhabi", + "Derived From": "nan", + "Paper Title": "Maknuune: A Large Open Palestinian Arabic Lexicon", + "Paper Link": "https://arxiv.org/pdf/2210.12985.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "Gdrive", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "lexicon analysis", + "Venue Title": "WANLP", + "Citations": "nan", + "Venue Type": "workshop", + "Venue Name": "Arabic Natural Language Processing Workshop", + "Authors": "Shahd Dibas, Christian Khairallah, Nizar Habash, Omar Fayez Sadi, Tariq Sairafy, Karmel Sarabta, Abrar Ardah", + "Affiliations": "NYUAD, University of Oxford, UNRWA", + "Abstract": "We present Maknuune, a large open lexicon for the Palestinian Arabic dialect. 
Maknuune has over 36K entries from 17K lemmas, and 3.7K roots. All entries include diacritized Arabic orthography, phonological transcription and English glosses. Some entries are enriched with additional information such as broken plurals and templatic feminine forms, associated phrases and collocations, Standard Arabic glosses, and examples or notes on grammar, usage, or location of collected entry.", + "Added By": "Christian Khairallah" +} \ No newline at end of file diff --git a/datasets/marsa__multi-domain_arabic_resources_for_sentiment_analysis.json b/datasets/marsa__multi-domain_arabic_resources_for_sentiment_analysis.json new file mode 100644 index 0000000..49e323c --- /dev/null +++ b/datasets/marsa__multi-domain_arabic_resources_for_sentiment_analysis.json @@ -0,0 +1,36 @@ +{ + "Name": "MARSA: Multi-Domain Arabic Resources for Sentiment Analysis", + "Subsets": [], + "HF Link": "nan", + "Link": "mailto:Sohumoud@imamu.edu.sa", + "License": "CC BY 4.0", + "Year": 2021, + "Language": "ar", + "Dialect": "ar-GLF: (Arabic (Gulf))", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "MARSA\u2014the largest sentiment annotated corpus for Dialectal Arabic (DA) in the Gulf region, which consists of 61,353 manually labeled tweets that contain a total of 840K tokens. The tweets were collected from trending hashtags in four domains: political, social, sports, and technology to create a multi-domain corpus.", + "Volume": "61,353", + "Unit": "sentences", + "Ethical Risks": "Medium", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "MARSA: Multi-Domain Arabic Resources for Sentiment Analysis", + "Paper Link": "https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=9576756", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Upon-Request", + "Cost": "nan", + "Test Split": "No", + "Tasks": "sentiment analysis", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "Areeb Alowisheq,Nora Al-Twairesh,Mawaheb Altuwaijri,Afnan Almoammar,Alhanouf Alsuwailem,Tarfa Albuhairi,Wejdan Alahaideb,Sarah Alhumoud", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Aljoharah R" +} \ No newline at end of file diff --git a/datasets/marsum__moroccan_articles_summarisation.json b/datasets/marsum__moroccan_articles_summarisation.json new file mode 100644 index 0000000..848d451 --- /dev/null +++ b/datasets/marsum__moroccan_articles_summarisation.json @@ -0,0 +1,36 @@ +{ + "Name": "MArSUM: Moroccan Articles Summarisation", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/MArSum", + "Link": "https://github.com/KamelGaanoun/MoroccanSummarization", + "License": "CC BY 4.0", + "Year": 2022, + "Language": "ar", + "Dialect": "ar-MA: (Arabic (Morocco))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "crawling", + "Description": "MArSUM is the first open corpus dedicated to Moroccan dialect text summarization. The articles are retrieved from the GOUD.ma website and filtered to retain only Moroccan dialect.
We have compiled a corpus of almost 20k articles with their titles.", + "Volume": "20,000", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "INSEA-Morocco (Institut Nationale de Statistiques et d'Economie Appliqu\u00e9e)", + "Derived From": "nan", + "Paper Title": "Automatic Text Summarization for Moroccan Arabic Dialect Using an Artificial Intelligence Approach", + "Paper Link": "https://link.springer.com/chapter/10.1007/978-3-031-06458-6_13", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "summarization", + "Venue Title": "CBI'22", + "Citations": "nan", + "Venue Type": "conference", + "Venue Name": "International Conference of Business Intelligence", + "Authors": "Kamel Gaanoun, Abdou Mohamed Naira, Anass Allak, Imade Benelallam", + "Affiliations": "INSEA, AIOX Labs", + "Abstract": "A major advantage of artificial intelligence is its ability to automatically perform tasks at a human-like level quickly; this is needed in many fields, and more particularly in Automatic Text Summarization (ATS). Several advances related to this technique were made in recent years for both extractive and abstractive approaches, notably with the advent of sequence-to-sequence (seq2seq) and Transformers-based models. In spite of this, the Arabic language is largely less represented in this field, due to its complexity and a lack of datasets for ATS. Although some ATS works exist for Modern Standard Arabic (MSA), there is a lack of ATS works for the Arabic dialects that are more prevalent on social networking platforms and the Internet in general. Intending to take an initial step toward meeting this need, we present the first work of ATS concerning the Moroccan dialect known as Darija. This paper introduces the first dataset intended for the summarization of articles written in Darija. In addition, we present state-of-the-art results based on the ROUGE metric for extractive methods based on BERT embeddings and K-MEANS clustering, as well as abstractive methods based on Transformers models.", + "Added By": "Kamel GAANOUN" +} \ No newline at end of file diff --git a/datasets/masc.json b/datasets/masc.json new file mode 100644 index 0000000..58a1244 --- /dev/null +++ b/datasets/masc.json @@ -0,0 +1,36 @@ +{ + "Name": "MASC", + "Subsets": [], + "HF Link": "https://hf.co/datasets/abdusahmbzuai/masc_dev", + "Link": "https://github.com/almoslmi/masc", + "License": "custom", + "Year": 2018, + "Language": "ar", + "Dialect": "mixed", + "Domain": "reviews", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "Multi-domain Arabic Sentiment Corpus (MASC) with a size of 8860 positive and negative reviews from different domains", + "Volume": "8,860", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "Multiple Institutions", + "Derived From": "nan", + "Paper Title": "Arabic senti-lexicon: Constructing publicly available language resources for Arabic sentiment analysis", + "Paper Link": "https://journals.sagepub.com/doi/full/10.1177/0165551516683908", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "sentiment analysis, dialect identification, topic classification", + "Venue Title": "JIS", + "Citations": "54.0", + "Venue Type": "journal", + "Venue Name": "Journal of Information Science", + "Authors": "Tareq Al-Moslmi,M. Albared,Adel Al-Shabi,N. Omar,S. 
Abdullah", + "Affiliations": ",,,,", + "Abstract": "Sentiment analysis is held to be one of the highly dynamic recent research fields in Natural Language Processing, facilitated by the quickly growing volume of Web opinion data. Most of the approaches in this field are focused on English due to the lack of sentiment resources in other languages such as the Arabic language and its large variety of dialects. In most sentiment analysis applications, good sentiment resources play a critical role. Based on that, in this article, several publicly available sentiment analysis resources for Arabic are introduced. This article introduces the Arabic senti-lexicon, a list of 3880 positive and negative synsets annotated with their part of speech, polarity scores, dialects synsets and inflected forms. This article also presents a Multi-domain Arabic Sentiment Corpus (MASC) with a size of 8860 positive and negative reviews from different domains. In this article, an in-depth study has been conducted on five types of feature sets for exploiting effective features and investigating their effect on performance of Arabic sentiment analysis. The aim is to assess the quality of the developed language resources and to integrate different feature sets and classification algorithms to synthesise a more accurate sentiment analysis method. The Arabic senti-lexicon is used for generating feature vectors. Five well-known machine learning algorithms: na\u00efve Bayes, k-nearest neighbours, support vector machines (SVMs), logistic linear regression and neural network are employed as base-classifiers for each of the feature sets. A wide range of comparative experiments on standard Arabic data sets were conducted, discussion is presented and conclusions are drawn. The experimental results show that the Arabic senti-lexicon is a very useful resource for Arabic sentiment analysis. 
Moreover, results show that classifiers which are trained on feature vectors derived from the corpus using the Arabic sentiment lexicon are more accurate than classifiers trained using the raw corpus.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/masc__massive_arabic_speech_corpus.json b/datasets/masc__massive_arabic_speech_corpus.json new file mode 100644 index 0000000..b7d761a --- /dev/null +++ b/datasets/masc__massive_arabic_speech_corpus.json @@ -0,0 +1,36 @@ +{ + "Name": "MASC: Massive Arabic Speech Corpus", + "Subsets": [], + "HF Link": "https://hf.co/datasets/pain/MASC", + "Link": "https://ieee-dataport.org/open-access/masc-massive-arabic-speech-corpus", + "License": "CC BY 4.0", + "Year": 2022, + "Language": "ar", + "Dialect": "mixed", + "Domain": "transcribed audio", + "Form": "spoken", + "Collection Style": "crawling and annotation(other)", + "Description": "This corpus is a dataset that contains 1,000 hours of speech sampled at 16 kHz and crawled from over 700 YouTube channels.", + "Volume": "1,000", + "Unit": "hours", + "Ethical Risks": "nan", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "speech recognition", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "Mohammad Al-Fetyani, Muhammad Al-Barham, Gheith Abandah, Adham Alsharkawi, Maha Dawas", + "Affiliations": "nan", + "Abstract": "This paper releases and describes the creation of the Massive Arabic Speech Corpus (MASC). This corpus is a dataset that contains 1,000 hours of speech sampled at 16 kHz and crawled from over 700 YouTube channels. MASC is a multi-regional, multi-genre, and multi-dialect dataset that is intended to advance the research and development of Arabic speech technology with a special emphasis on Arabic speech recognition. In addition to MASC, a pre-trained 3-gram language model and a pre-trained automatic speech recognition model are also developed and made available for interested researchers. For a better language model, a new and unified Arabic speech corpus is required, and thus, a dataset of 12M unique Arabic words is created and released. To make practical and convenient use of MASC, the whole dataset is stratified based on dialect into clean and noisy portions. Each of the two portions is then stratified and divided into three subsets: development, test, and training sets.
The best word error rate achieved by the speech recognition model is 19.8% for the clean development set and 21.8% for the clean test set.", + "Added By": "Mohammad Amjad Al-Fetyani" +} \ No newline at end of file diff --git a/datasets/masked_arab_states_dataset_(masd).json b/datasets/masked_arab_states_dataset_(masd).json new file mode 100644 index 0000000..16a4b75 --- /dev/null +++ b/datasets/masked_arab_states_dataset_(masd).json @@ -0,0 +1,36 @@ +{ + "Name": "Masked Arab States Dataset (MASD)", + "Subsets": [], + "HF Link": "https://hf.co/datasets/SaiedAlshahrani/MASD", + "Link": "https://github.com/SaiedAlshahrani/performance-implications/tree/main/Language-Modeling-Evals/MASD", + "License": "MIT License", + "Year": 2023, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "manual curation", + "Description": "MASD is a masked prompts dataset created using 20 Arab States with their corresponding capital cities, nationalities, currencies, and on which continents they are located, consisting of four categories: country-capital prompts, country-currency prompts, country-nationality prompts, and country-continent prompts. Each prompt category has 40 masked prompts, and the total number of masked prompts in the MASD dataset is 160. This dataset is used to evaluate Arabic Masked Language Models (MLMs).", + "Volume": "160", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "Clarkson University", + "Derived From": "nan", + "Paper Title": "Performance Implications of Using Unrepresentative Corpora in Arabic Natural Language Processing", + "Paper Link": "https://aclanthology.org/2023.arabicnlp-1.19.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "HuggingFace", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "language modeling", + "Venue Title": "ArabicNLP 2023", + "Citations": "nan", + "Venue Type": "conference", + "Venue Name": "The First Arabic Natural Language Processing Conference", + "Authors": "Saied Alshahrani, Norah Alshahrani, Soumyabrata Dey, Jeanna Matthews", + "Affiliations": "Clarkson University", + "Abstract": "Wikipedia articles are a widely used source of training data for Natural Language Processing (NLP) research, particularly as corpora for low-resource languages like Arabic. However, it is essential to understand the extent to which these corpora reflect the representative contributions of native speakers, especially when many entries in a given language are directly translated from other languages or automatically generated through automated mechanisms. In this paper, we study the performance implications of using inorganic corpora that are not representative of native speakers and are generated through automated techniques such as bot generation or automated template-based translation. The case of the Arabic Wikipedia editions gives a unique case study of this since the Moroccan Arabic Wikipedia edition (ARY) is small but representative, the Egyptian Arabic Wikipedia edition (ARZ) is large but unrepresentative, and the Modern Standard Arabic Wikipedia edition (AR) is both large and more representative. We intrinsically evaluate the performance of two main NLP upstream tasks, namely word representation and language modeling, using word analogy evaluations and fill-mask evaluations using our two newly created datasets: Arab States Analogy Dataset (ASAD) and Masked Arab States Dataset (MASD).
We demonstrate that for good NLP performance, we need both large and organic corpora; neither alone is sufficient. We show that producing large corpora through automated means can be counter-productive, producing models that both perform worse and lack cultural richness and meaningful representation of the Arabic language and its native speakers.", + "Added By": "Saied Alshahrani" +} \ No newline at end of file diff --git a/datasets/mawqif.json b/datasets/mawqif.json new file mode 100644 index 0000000..7107c2a --- /dev/null +++ b/datasets/mawqif.json @@ -0,0 +1,36 @@ +{ + "Name": "Mawqif", + "Subsets": [], + "HF Link": "https://hf.co/datasets/NoraAlt/Mawqif_Stance-Detection", + "Link": "https://github.com/NoraAlt/Mawqif-Arabic-Stance", + "License": "unknown", + "Year": 2022, + "Language": "ar", + "Dialect": "mixed", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "Mawqif is the first Arabic dataset that can be used for target-specific stance detection. This is a multi-label dataset where each data point is annotated for stance, sentiment, and sarcasm.", + "Volume": "4,121", + "Unit": "sentences", + "Ethical Risks": "Medium", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "Mawqif: A Multi-label Arabic Dataset for Target-specific Stance Detection", + "Paper Link": "https://aclanthology.org/2022.wanlp-1.16/", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "sentiment analysis, topic classification, irony detection, stance detection", + "Venue Title": "WANLP", + "Citations": "nan", + "Venue Type": "workshop", + "Venue Name": "Arabic Natural Language Processing Workshop", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "Social media platforms are becoming inherent parts of people\u2019s daily life to express opinions and stances toward topics of varying polarities. Stance detection determines the viewpoint expressed in a text toward a target. While communication on social media (e.g., Twitter) takes place in more than 40 languages, the majority of stance detection research has been focused on English. Although some efforts have recently been made to develop stance detection datasets in other languages, no similar efforts seem to have considered the Arabic language. In this paper, we present Mawqif, the first Arabic dataset for target-specific stance detection, composed of 4,121 tweets annotated with stance, sentiment, and sarcasm polarities. Mawqif, as a multi-label dataset, can provide more opportunities for studying the interaction between different opinion dimensions and evaluating a multi-task model. We provide a detailed description of the dataset, present an analysis of the produced annotation, and evaluate four BERT-based models on it. Our best model achieves a macro-F1 of 78.89%, which shows that there is ample room for improvement on this challenging task.
We publicly release our dataset, the annotation guidelines, and the code of the experiments.", + "Added By": "Nora Saleh Alturayeif" +} \ No newline at end of file diff --git a/datasets/mc4.json b/datasets/mc4.json new file mode 100644 index 0000000..149e335 --- /dev/null +++ b/datasets/mc4.json @@ -0,0 +1,36 @@ +{ + "Name": "mC4", + "Subsets": [], + "HF Link": "https://hf.co/datasets/legacy-datasets/mc4", + "Link": "https://www.tensorflow.org/datasets/catalog/c4#c4multilingual_nights_stay", + "License": "CC BY 4.0", + "Year": 2019, + "Language": "multilingual", + "Dialect": "mixed", + "Domain": "other", + "Form": "text", + "Collection Style": "crawling", + "Description": "A colossal, cleaned version of Common Crawl's web crawl corpus.", + "Volume": "53,256,040", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "Google", + "Derived From": "C4", + "Paper Title": "Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer", + "Paper Link": "https://arxiv.org/pdf/1910.10683.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "text generation, language modeling", + "Venue Title": "JMLR", + "Citations": "nan", + "Venue Type": "journal", + "Venue Name": "Journal of Machine Learning Research", + "Authors": "Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, Peter J. Liu", + "Affiliations": "Google; Google; Google; Google; Google; Google; Google; Google; Google", + "Abstract": "Transfer learning, where a model is first pre-trained on a data-rich task before being finetuned on a downstream task, has emerged as a powerful technique in natural language\nprocessing (NLP). The effectiveness of transfer learning has given rise to a diversity of\napproaches, methodology, and practice. In this paper, we explore the landscape of transfer\nlearning techniques for NLP by introducing a unified framework that converts all text-based\nlanguage problems into a text-to-text format. Our systematic study compares pre-training\nobjectives, architectures, unlabeled data sets, transfer approaches, and other factors on\ndozens of language understanding tasks. By combining the insights from our exploration\nwith scale and our new \u201cColossal Clean Crawled Corpus\u201d, we achieve state-of-the-art results\non many benchmarks covering summarization, question answering, text classification, and\nmore. To facilitate future work on transfer learning for NLP, we release our data set,\npre-trained models, and code", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/mcwc.json b/datasets/mcwc.json new file mode 100644 index 0000000..fb1e7b6 --- /dev/null +++ b/datasets/mcwc.json @@ -0,0 +1,36 @@ +{ + "Name": "MCWC", + "Subsets": [], + "HF Link": "https://hf.co/datasets/ezzini/MCWCv1", + "Link": "https://hf.co/datasets/ezzini/MCWCv1", + "License": "MIT License", + "Year": 2024, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "crawling and annotation(translation)", + "Description": "The \u201cMultilingual Corpus of World\u2019s Constitutions\u201d (MCWC) is a rich resource available in English, Arabic, and Spanish, encompassing constitutions from various nations.
This corpus serves as a vital asset for the NLP community, facilitating advanced research in constitutional analysis, machine translation, and cross-lingual legal studies.", + "Volume": "236,156", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "The Multilingual Corpus of World\u2019s Constitutions (MCWC)", + "Paper Link": "https://aclanthology.org/2024.osact-1.7.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "HuggingFace", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "machine translation", + "Venue Title": "OSACT", + "Citations": "nan", + "Venue Type": "workshop", + "Venue Name": "Workshop on Open-Source Arabic Corpora and Processing Tools", + "Authors": "Mo El-Haj and Saad Ezzini", + "Affiliations": "nan", + "Abstract": "The \u201cMultilingual Corpus of World\u2019s Constitutions\u201d (MCWC) is a rich resource available in English, Arabic, and\nSpanish, encompassing constitutions from various nations. This corpus serves as a vital asset for the NLP\ncommunity, facilitating advanced research in constitutional analysis, machine translation, and cross-lingual legal\nstudies. To ensure comprehensive coverage, for constitutions not originally available in Arabic and Spanish,\nwe employed a fine-tuned state-of-the-art machine translation model. MCWC prepares its data to ensure high\nquality and minimal noise, while also providing valuable mappings of constitutions to their respective countries\nand continents, facilitating comparative analysis. Notably, the corpus offers pairwise sentence alignments across\nlanguages, supporting machine translation experiments. We utilise a leading Machine Translation model, fine-tuned\non the MCWC to achieve accurate and context-aware translations. Additionally, we introduce an independent\nMachine Translation model as a comparative baseline. Fine-tuning the model on MCWC improves accuracy,\nhighlighting the significance of such a legal corpus for NLP and Machine Translation. 
MCWC\u2019s diverse multilingual\ncontent and commitment to data quality contribute to advancements in legal text analysis within the NLP community,\nfacilitating exploration of constitutional texts and multilingual data analysis.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/mediaspeech.json b/datasets/mediaspeech.json new file mode 100644 index 0000000..eeba818 --- /dev/null +++ b/datasets/mediaspeech.json @@ -0,0 +1,36 @@ +{ + "Name": "MediaSpeech", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/MediaSpeech_ar", + "Link": "https://github.com/NTRLab/MediaSpeech", + "License": "CC BY 4.0", + "Year": 2021, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "transcribed audio", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "An open-source 10-hour ASR system evaluation dataset, NTR MediaSpeech, for 4 languages: Spanish, French, Turkish and Arabic", + "Volume": "10", + "Unit": "hours", + "Ethical Risks": "Low", + "Provider": "Multiple Institutions", + "Derived From": "nan", + "Paper Title": "MediaSpeech: Multilanguage ASR Benchmark and Dataset", + "Paper Link": "https://arxiv.org/ftp/arxiv/papers/2103/2103.16193.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "speech recognition", + "Venue Title": "ArXiv", + "Citations": "0.0", + "Venue Type": "preprint", + "Venue Name": "ArXiv", + "Authors": "Rostislav Kolobov,Olga Okhapkina,Olga Omelchishina,A. Platunov,Roman Bedyakin,Vyacheslav Moshkin,D. Men'shikov,N. Mikhaylovskiy", + "Affiliations": ",,,,,,,", + "Abstract": "The performance of automated speech recognition (ASR) systems is well known to differ for varied application domains. At the same time, vendors and research groups typically report ASR quality results either for limited use simplistic domains (audiobooks, TED talks), or proprietary datasets. To fill this gap, we provide an open-source 10-hour ASR system evaluation dataset NTR MediaSpeech for 4 languages: Spanish, French, Turkish and Arabic. The dataset was collected from the official youtube channels of media in the respective languages, and manually transcribed. We estimate that the WER of the dataset is under 5%. We have benchmarked many ASR systems available both commercially and freely, and provide the benchmark results.
We also open-source baseline QuartzNet models for each language.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/medical_corpus.json b/datasets/medical_corpus.json new file mode 100644 index 0000000..e0cd204 --- /dev/null +++ b/datasets/medical_corpus.json @@ -0,0 +1,36 @@ +{ + "Name": "Medical Corpus", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/MedicalCorpus", + "Link": "https://github.com/licvol/Arabic-Spoken-Language-Understanding/tree/master/MedicalCorpus", + "License": "unknown", + "Year": 2019, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "manual curation", + "Description": "A corpus from a medical care forum known as Doctissimo", + "Volume": "152", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "Multiple Institutions", + "Derived From": "nan", + "Paper Title": "An Arabic Multi-Domain Spoken Language Understanding System", + "Paper Link": "https://aclanthology.org/W19-7407.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "disease identification", + "Venue Title": "ICNLSP", + "Citations": "0.0", + "Venue Type": "conference", + "Venue Name": "International Conference on Natural Language and Speech Processing", + "Authors": "Mohamed Lichouri,Mourad Abbas,R. Djeradi,A. Djeradi", + "Affiliations": "CRSTDLA,,,", + "Abstract": "In this paper, we suggest the generalization of an Arabic Spoken Language Understanding (SLU) system in a multi-domain human-machine dialog. We are interested particularly in domain portability of SLU system related to both structured (DBMS) and unstructured data (Information Extraction), related to four domains. In this work, we used the thematic approach for four domains which are School Management, Medical Diagnostics, Consultation domain and Question-Answering domain (DAWQAS).
We should note that two kinds of classifiers are used in our experiments: statistical and neural, namely: Gaussian Naive Bayes, Bernoulli Naive Bayes, Logistic Regression, SGD, Passive Aggressive Classifier, Perceptron, Linear Support Vector and Convolutional Neural Network.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/mega-cov.json b/datasets/mega-cov.json new file mode 100644 index 0000000..80a4636 --- /dev/null +++ b/datasets/mega-cov.json @@ -0,0 +1,36 @@ +{ + "Name": "Mega-COV", + "Subsets": [], + "HF Link": "nan", + "Link": "https://github.com/echen102/COVID-19-TweetIDs", + "License": "CC BY-NC-SA 4.0", + "Year": 2021, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "A Billion-Scale Dataset of 100+ Languages for COVID-19", + "Volume": "45,000,000", + "Unit": "sentences", + "Ethical Risks": "Medium", + "Provider": "University of British Columbia", + "Derived From": "nan", + "Paper Title": "Mega-COV: A Billion-Scale Dataset of 100+ Languages for COVID-19", + "Paper Link": "https://arxiv.org/pdf/2005.06012.pdf", + "Script": "Arab-Latn", + "Tokenized": "Yes", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "COVID relevance, COVID misinformation detection", + "Venue Title": "EACL", + "Citations": "10.0", + "Venue Type": "conference", + "Venue Name": "European Chapter of the Association for Computational Linguistics", + "Authors": "Muhammad Abdul-Mageed,AbdelRahim Elmadany,Dinesh Pabbi,Kunal Verma,Rannie Lin", + "Affiliations": ",University of British Columbia,,The University of British Columbia,", + "Abstract": "We describe Mega-COV, a billion-scale dataset from Twitter for studying COVID-19. The dataset is diverse (covers 268 countries), longitudinal (goes as far back as 2007), multilingual (comes in 100+ languages), and has a significant number of location-tagged tweets (~169M tweets). We release tweet IDs from the dataset. We also develop two powerful models, one for identifying whether or not a tweet is related to the pandemic (best F1=97%) and another for detecting misinformation about COVID-19 (best F1=92%). A human annotation study reveals the utility of our models on a subset of Mega-COV. Our data and models can be useful for studying a wide host of phenomena related to the pandemic. Mega-COV and our models are publicly available.", + "Added By": "Mustafa Ghaleb" +} \ No newline at end of file diff --git a/datasets/merged_arabic_corpus_of_isolated_words.json b/datasets/merged_arabic_corpus_of_isolated_words.json new file mode 100644 index 0000000..5de3fd6 --- /dev/null +++ b/datasets/merged_arabic_corpus_of_isolated_words.json @@ -0,0 +1,36 @@ +{ + "Name": "Merged Arabic Corpus of Isolated Words", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/Merged_Arabic_Corpus_of_Isolated_Words", + "Link": "https://www.kaggle.com/datasets/mohamedanwarvic/merged-arabic-corpus-of-isolated-words", + "License": "ODbL-1.0", + "Year": 2019, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "transcribed audio", + "Form": "spoken", + "Collection Style": "other", + "Description": "A voice-recorded dataset of 50 native Arabic speakers saying 20 words, each about 10 times. It has been recorded with a 44100 Hz sampling rate and 16-bit resolution.
This dataset can be used for tasks like Speaker Recognition, Speaker Verification, and Voice Biometrics.", + "Volume": "9,992", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "nan", + "Derived From": "The Arabic Speech Corpus for Isolated Words", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "kaggle", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "speaker recognition, speech recognition", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/metrec.json b/datasets/metrec.json new file mode 100644 index 0000000..6055387 --- /dev/null +++ b/datasets/metrec.json @@ -0,0 +1,36 @@ +{ + "Name": "MetRec", + "Subsets": [], + "HF Link": "https://hf.co/datasets/Zaid/metrec", + "Link": "https://github.com/zaidalyafeai/MetRec", + "License": "MIT License", + "Year": 2020, + "Language": "ar", + "Dialect": "ar-CLS: (Arabic (Classic))", + "Domain": "other", + "Form": "text", + "Collection Style": "crawling", + "Description": "More than 40K verses with their meters", + "Volume": "47,124", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "KFUPM", + "Derived From": "nan", + "Paper Title": "MetRec: A dataset for meter classification of arabic poetry", + "Paper Link": "https://www.sciencedirect.com/science/article/pii/S2352340920313792", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "meter classification", + "Venue Title": "Data in brief", + "Citations": "1.0", + "Venue Type": "journal", + "Venue Name": "Data in brief", + "Authors": "Maged S. Al-shaibani,Zaid Alyafeai,Irfan Ahmad", + "Affiliations": ",,", + "Abstract": "In this data article, we report a dataset related to the research titled \u201cMeter Classification of Arabic Poems Using Deep Bidirectional Recurrent Neural Networks\u201d[2]. The dataset was collected from a large repository of Arabic poems, Aldiwan website [1]. The data collection was done using a Python script that scrapes the website to find the poems and their associated meters. The dataset contains the verses and their corresponding meter classes. Meter classes are represented as numbers from 0 to 13. The dataset can be highly useful for further research in order to improve the field of Arabic poems\u2019 meter classification.", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/mfqa.json b/datasets/mfqa.json new file mode 100644 index 0000000..c1dedf6 --- /dev/null +++ b/datasets/mfqa.json @@ -0,0 +1,36 @@ +{ + "Name": "MFQA", + "Subsets": [], + "HF Link": "https://hf.co/datasets/clips/mqa", + "Link": "https://hf.co/datasets/clips/mqa", + "License": "CC0", + "Year": 2021, + "Language": "multilingual", + "Dialect": "mixed", + "Domain": "web pages", + "Form": "text", + "Collection Style": "crawling", + "Description": "MQA is a Multilingual corpus of Questions and Answers (MQA) parsed from the Common Crawl.
Questions are divided into two types: Frequently Asked Questions (FAQ) and Community Question Answering (CQA).", + "Volume": "3,017,456", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "University of Antwerp", + "Derived From": "Common Crawl", + "Paper Title": "MFAQ: a Multilingual FAQ Dataset", + "Paper Link": "https://arxiv.org/pdf/2109.12870.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "HuggingFace", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "frequently asked questions, question answering", + "Venue Title": "arXiv", + "Citations": "nan", + "Venue Type": "preprint", + "Venue Name": "nan", + "Authors": "Maxime De Bruyn, Ehsan Lotfi, Jeska Buhmann, Walter Daelemans", + "Affiliations": "CLiPS Research Center University of Antwerp", + "Abstract": "In this paper, we present the first multilingual\nFAQ dataset publicly available. We collected\naround 6M FAQ pairs from the web, in 21 different languages. Although this is significantly\nlarger than existing FAQ retrieval datasets, it\ncomes with its own challenges: duplication of\ncontent and uneven distribution of topics. We\nadopt a similar setup as Dense Passage Retrieval (DPR) (Karpukhin et al., 2020) and test\nvarious bi-encoders on this dataset. Our experiments reveal that a multilingual model based\non XLM-RoBERTa (Conneau et al., 2019)\nachieves the best results, except for English.\nLower resources languages seem to learn from\none another as a multilingual model achieves a\nhigher MRR than language-specific ones. Our\nqualitative analysis reveals the brittleness of\nthe model on simple word changes. We publicly release our dataset, model and training script", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/mgb-2.json b/datasets/mgb-2.json new file mode 100644 index 0000000..bcf8660 --- /dev/null +++ b/datasets/mgb-2.json @@ -0,0 +1,36 @@ +{ + "Name": "MGB-2", + "Subsets": [], + "HF Link": "nan", + "Link": "https://arabicspeech.org/mgb2/", + "License": "unknown", + "Year": 2017, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "transcribed audio", + "Form": "spoken", + "Collection Style": "crawling and annotation(other)", + "Description": "Broadcast recordings from Aljazeera TV programs that have been manually captioned with no timing information", + "Volume": "1,200", + "Unit": "hours", + "Ethical Risks": "Low", + "Provider": "QCRI", + "Derived From": "nan", + "Paper Title": "SPEECH RECOGNITION CHALLENGE IN THE WILD: ARABIC MGB-3", + "Paper Link": "https://arxiv.org/pdf/1709.07276.pdf", + "Script": "Arab", + "Tokenized": "Yes", + "Host": "other", + "Access": "Upon-Request", + "Cost": "nan", + "Test Split": "No", + "Tasks": "speech recognition", + "Venue Title": "ASRU", + "Citations": "64.0", + "Venue Type": "workshop", + "Venue Name": "IEEE Automatic Speech Recognition and Understanding Workshop", + "Authors": "A. Ali,S. Vogel,S. Renals", + "Affiliations": ",,", + "Abstract": "This paper describes the Arabic MGB-3 Challenge \u2014 Arabic Speech Recognition in the Wild. Unlike last year's Arabic MGB-2 Challenge, for which the recognition task was based on more than 1,200 hours broadcast TV news recordings from Aljazeera Arabic TV programs, MGB-3 emphasises dialectal Arabic using a multi-genre collection of Egyptian YouTube videos. Seven genres were used for the data collection: comedy, cooking, family/kids, fashion, drama, sports, and science (TEDx).
A total of 16 hours of videos, split evenly across the different genres, were divided into adaptation, development and evaluation data sets. The Arabic MGB-Challenge comprised two tasks: A) Speech transcription, evaluated on the MGB-3 test set, along with the 10 hour MGB-2 test set to report progress on the MGB-2 evaluation; B) Arabic dialect identification, introduced this year in order to distinguish between four major Arabic dialects \u2014 Egyptian, Levantine, North African, Gulf, as well as Modern Standard Arabic. Two hours of audio per dialect were released for development and a further two hours were used for evaluation. For dialect identification, both lexical features and i-vector bottleneck features were shared with participants in addition to the raw audio recordings. Overall, thirteen teams submitted ten systems to the challenge. We outline the approaches adopted in each system, and summarise the evaluation results.", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/mgb-3.json b/datasets/mgb-3.json new file mode 100644 index 0000000..8b618ca --- /dev/null +++ b/datasets/mgb-3.json @@ -0,0 +1,36 @@ +{ + "Name": "MGB-3", + "Subsets": [], + "HF Link": "nan", + "Link": "https://arabicspeech.org/mgb3-asr-2/", + "License": "unknown", + "Year": 2017, + "Language": "ar", + "Dialect": "ar-EG: (Arabic (Egypt))", + "Domain": "transcribed audio", + "Form": "spoken", + "Collection Style": "crawling and annotation(other)", + "Description": "Explores multi-genre data: comedy, cooking, cultural, environment, family-kids, fashion, movies-drama, sports, and science talks (TEDx)", + "Volume": "16", + "Unit": "hours", + "Ethical Risks": "Low", + "Provider": "QCRI", + "Derived From": "nan", + "Paper Title": "SPEECH RECOGNITION CHALLENGE IN THE WILD: ARABIC MGB-3", + "Paper Link": "https://arxiv.org/pdf/1709.07276.pdf", + "Script": "Arab", + "Tokenized": "Yes", + "Host": "other", + "Access": "Upon-Request", + "Cost": "nan", + "Test Split": "No", + "Tasks": "speech recognition", + "Venue Title": "ASRU", + "Citations": "64.0", + "Venue Type": "workshop", + "Venue Name": "IEEE Automatic Speech Recognition and Understanding Workshop", + "Authors": "A. Ali,S. Vogel,S. Renals", + "Affiliations": ",,", + "Abstract": "This paper describes the Arabic MGB-3 Challenge \u2014 Arabic Speech Recognition in the Wild. Unlike last year's Arabic MGB-2 Challenge, for which the recognition task was based on more than 1,200 hours broadcast TV news recordings from Aljazeera Arabic TV programs, MGB-3 emphasises dialectal Arabic using a multi-genre collection of Egyptian YouTube videos. Seven genres were used for the data collection: comedy, cooking, family/kids, fashion, drama, sports, and science (TEDx). A total of 16 hours of videos, split evenly across the different genres, were divided into adaptation, development and evaluation data sets. The Arabic MGB-Challenge comprised two tasks: A) Speech transcription, evaluated on the MGB-3 test set, along with the 10 hour MGB-2 test set to report progress on the MGB-2 evaluation; B) Arabic dialect identification, introduced this year in order to distinguish between four major Arabic dialects \u2014 Egyptian, Levantine, North African, Gulf, as well as Modern Standard Arabic. Two hours of audio per dialect were released for development and a further two hours were used for evaluation. For dialect identification, both lexical features and i-vector bottleneck features were shared with participants in addition to the raw audio recordings.
Overall, thirteen teams submitted ten systems to the challenge. We outline the approaches adopted in each system, and summarise the evaluation results.", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/mgb-5.json b/datasets/mgb-5.json new file mode 100644 index 0000000..04a643a --- /dev/null +++ b/datasets/mgb-5.json @@ -0,0 +1,36 @@ +{ + "Name": "MGB-5", + "Subsets": [], + "HF Link": "nan", + "Link": "https://arabicspeech.org/mgb5/", + "License": "unknown", + "Year": 2019, + "Language": "ar", + "Dialect": "ar-MA: (Arabic (Morocco))", + "Domain": "transcribed audio", + "Form": "spoken", + "Collection Style": "crawling and annotation(other)", + "Description": "Moroccan Arabic speech extracted from 93 YouTube videos distributed across seven genres: comedy, cooking, family/children, fashion, drama, sports, and science clips.", + "Volume": "14", + "Unit": "hours", + "Ethical Risks": "Low", + "Provider": "QCRI", + "Derived From": "nan", + "Paper Title": "The MGB-5 Challenge: Recognition and Dialect Identification of Dialectal Arabic Speech", + "Paper Link": "https://ieeexplore.ieee.org/document/9003960", + "Script": "Arab", + "Tokenized": "Yes", + "Host": "other", + "Access": "Upon-Request", + "Cost": "nan", + "Test Split": "No", + "Tasks": "speech recognition", + "Venue Title": "ASRU", + "Citations": "18.0", + "Venue Type": "workshop", + "Venue Name": "IEEE Automatic Speech Recognition and Understanding Workshop", + "Authors": "A. Ali,Suwon Shon,Younes Samih,Hamdy Mubarak,Ahmed Abdelali,James R. Glass,S. Renals,K. Choukri", + "Affiliations": ",,University Of D\u00fcsseldorf;Computational Linguistics,,,,,", + "Abstract": "This paper describes the fifth edition of the Multi-Genre Broadcast Challenge (MGB-5), an evaluation focused on Arabic speech recognition and dialect identification. MGB-5 extends the previous MGB-3 challenge in two ways: first it focuses on Moroccan Arabic speech recognition; second the granularity of the Arabic dialect identification task is increased from 5 dialect classes to 17, by collecting data from 17 Arabic speaking countries. Both tasks use YouTube recordings to provide a multi-genre multi-dialectal challenge in the wild. Moroccan speech transcription used about 13 hours of transcribed speech data, split across training, development, and test sets, covering 7-genres: comedy, cooking, family/kids, fashion, drama, sports, and science (TEDx). The fine-grained Arabic dialect identification data was collected from known YouTube channels from 17 Arabic countries. 3,000 hours of this data was released for training, and 57 hours for development and testing. The dialect identification data was divided into three sub-categories based on the segment duration: short (under 5 s), medium (5\u201320 s), and long (>20 s). Overall, 25 teams registered for the challenge, and 9 teams submitted systems for the two tasks. 
We outline the approaches adopted in each system and summarize the evaluation results.", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/microsoft_terminology_collection.json b/datasets/microsoft_terminology_collection.json new file mode 100644 index 0000000..3262d82 --- /dev/null +++ b/datasets/microsoft_terminology_collection.json @@ -0,0 +1,36 @@ +{ + "Name": "Microsoft Terminology Collection", + "Subsets": [], + "HF Link": "https://hf.co/datasets/microsoft/ms_terms", + "Link": "https://www.microsoft.com/en-us/language/terminology", + "License": "custom", + "Year": 2022, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "The Microsoft Terminology Collection can be used to develop localized versions of applications that integrate with Microsoft products. It can also be used to integrate Microsoft terminology into other terminology collections or serve as a base IT glossary for language development in the nearly 100 languages available. Terminology is provided in .tbx format, an industry standard for terminology exchange.", + "Volume": "20,000", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "Microsoft", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "machine translation, language modeling", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/mimic-it.json b/datasets/mimic-it.json new file mode 100644 index 0000000..6242c52 --- /dev/null +++ b/datasets/mimic-it.json @@ -0,0 +1,36 @@ +{ + "Name": "MIMIC-IT", + "Subsets": [], + "HF Link": "https://hf.co/datasets/pufanyi/MIMICIT", + "Link": "https://hf.co/datasets/pufanyi/MIMICIT", + "License": "MIT License", + "Year": 2023, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "images", + "Collection Style": "other", + "Description": "MIMIC-IT offers a diverse and extensive dataset of 2.8M multimodal instruction-response pairs, designed to enhance the performance of Vision-Language Models (VLMs) in real-life scenarios, enabling VLMs to excel in perception, reasoning, and planning while also catering to a multilingual audience.", + "Volume": "2,100,000", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "MIMIC-IT: Multi-Modal In-Context Instruction Tuning", + "Paper Link": "https://arxiv.org/pdf/2306.05425", + "Script": "Arab", + "Tokenized": "No", + "Host": "HuggingFace", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "multimodal instruction tuning", + "Venue Title": "arXiv", + "Citations": "nan", + "Venue Type": "preprint", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "High-quality instructions and responses are essential for the zero-shot performance\nof large language models on interactive natural language tasks. For interactive\nvision-language tasks involving intricate visual scenes, a large quantity of diverse and creative instruction-response pairs should be imperative to tune vision-language models (VLMs).
Nevertheless, the current availability of vision-language\ninstruction-response pairs in terms of quantity, diversity, and creativity remains limited, posing challenges to the generalization of interactive VLMs. Here we present\nMultI-Modal In-Context Instruction Tuning (MIMIC-IT), a dataset comprising\n2.8 million multimodal instruction-response pairs, with 2.2 million unique instructions derived from images and videos. Each pair is accompanied by multi-modal\nin-context information, forming conversational contexts aimed at empowering\nVLMs in perception, reasoning, and planning. The instruction-response collection\nprocess, dubbed as Syphus, is scaled using an automatic annotation pipeline that\ncombines human expertise with GPT\u2019s capabilities. Using the MIMIC-IT dataset,\nwe train a large VLM named Otter. Based on extensive evaluations conducted\non vision-language benchmarks, it has been observed that Otter demonstrates remarkable proficiency in multi-modal perception, reasoning, and in-context learning.\nHuman evaluation reveals it effectively aligns with the user\u2019s intentions. We release\nthe MIMIC-IT dataset, instruction-response collection pipeline, benchmarks, and\nthe Otter model.\n", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/miracl.json b/datasets/miracl.json new file mode 100644 index 0000000..b61a4f1 --- /dev/null +++ b/datasets/miracl.json @@ -0,0 +1,36 @@ +{ + "Name": "MIRACL", + "Subsets": [], + "HF Link": "https://hf.co/datasets/miracl/miracl-corpus", + "Link": "https://hf.co/datasets/miracl/miracl-corpus", + "License": "Apache-2.0", + "Year": 2022, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "wikipedia", + "Form": "text", + "Collection Style": "crawling", + "Description": "MIRACL (Multilingual Information Retrieval Across a Continuum of Languages) is a multilingual retrieval dataset that focuses on search across 18 different languages, which collectively encompass over three billion native speakers around the world.", + "Volume": "2,061,414", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "HuggingFace", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "information retrieval", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/mkqa.json b/datasets/mkqa.json new file mode 100644 index 0000000..fc7c17d --- /dev/null +++ b/datasets/mkqa.json @@ -0,0 +1,36 @@ +{ + "Name": "MKQA", + "Subsets": [], + "HF Link": "https://hf.co/datasets/apple/mkqa", + "Link": "https://github.com/apple/ml-mkqa", + "License": "CC BY-SA 3.0", + "Year": 2020, + "Language": "multilingual", + "Dialect": "mixed", + "Domain": "other", + "Form": "text", + "Collection Style": "human translation", + "Description": "10k question-answer pairs aligned across 26 typologically diverse languages", + "Volume": "10,000", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "Apple", + "Derived From": "Natural Questions", + "Paper Title": "MKQA: A Linguistically Diverse Benchmark for Multilingual Open Domain Question Answering", + "Paper Link": "https://arxiv.org/pdf/2007.15207.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access":
"Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "question answering ", + "Venue Title": "ArXiv", + "Citations": "11.0", + "Venue Type": "preprint", + "Venue Name": "ArXiv", + "Authors": "S. Longpre,Yi Lu,Joachim Daiber", + "Affiliations": ",,Apple", + "Abstract": "Progress in cross-lingual modeling depends on challenging, realistic, and diverse evaluation sets. We introduce Multilingual Knowledge Questions and Answers (MKQA), an open-domain question answering evaluation set comprising 10k question-answer pairs aligned across 26 typologically diverse languages (260k question-answer pairs in total). The goal of this dataset is to provide a challenging benchmark for question answering quality across a wide set of languages. Answers are based on a language-independent data representation, making results comparable across languages and independent of language-specific passages. With 26 languages, this dataset supplies the widest range of languages to-date for evaluating question answering. We benchmark state-of-the-art extractive question answering baselines, trained on Natural Questions, including Multilingual BERT, and XLM-RoBERTa, in zero shot and translation settings. Results indicate this dataset is challenging, especially in low-resource languages.", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/ml_spoken_words.json b/datasets/ml_spoken_words.json new file mode 100644 index 0000000..1ba0559 --- /dev/null +++ b/datasets/ml_spoken_words.json @@ -0,0 +1,36 @@ +{ + "Name": "ml_spoken_words", + "Subsets": [], + "HF Link": "https://hf.co/datasets/MLCommons/ml_spoken_words", + "Link": "https://mlcommons.org/en/multilingual-spoken-words/", + "License": "CC BY 4.0", + "Year": 2021, + "Language": "multilingual", + "Dialect": "ar-CLS: (Arabic (Classic))", + "Domain": "transcribed audio", + "Form": "spoken", + "Collection Style": "other", + "Description": "Multilingual Spoken Words Corpus (MSWC), a large and growing audio dataset of spoken words in 50 different languages.", + "Volume": "7.6", + "Unit": "hours", + "Ethical Risks": "Low", + "Provider": "Coqui, Factored, Google, Harvard University, Intel, Landing AI, NVIDIA, University of Michigan", + "Derived From": "Common Voice dataset", + "Paper Title": "Multilingual Spoken Words Corpus", + "Paper Link": "https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/file/fe131d7f5a6b38b23cc967316c13dae2-Paper-round2.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "HuggingFace", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "keyword spotting, spoken term search", + "Venue Title": "NeurIPS", + "Citations": "1.0", + "Venue Type": "conference", + "Venue Name": "The Conference on Neural Information Processing Systems", + "Authors": "Mark Mazumder, Sharad Chitlangia, Colby Banbury, Yiping Kang, Juan Ciro, Keith Achorn, Daniel Galvez, Mark Sabini, Peter Mattson, David Kanter, Greg Diamos, Pete Warden, Josh Meyer, Vijay Janapa Reddi", + "Affiliations": "nan", + "Abstract": "Coqui, Factored, Google, Harvard University, Intel, Landing AI, NVIDIA, University of Michigan", + "Added By": "Wafaa Mohammed" +} \ No newline at end of file diff --git a/datasets/mldr.json b/datasets/mldr.json new file mode 100644 index 0000000..bb2ffba --- /dev/null +++ b/datasets/mldr.json @@ -0,0 +1,36 @@ +{ + "Name": "MLDR", + "Subsets": [], + "HF Link": "https://hf.co/datasets/Shitao/MLDR", + "Link": "https://hf.co/datasets/Shitao/MLDR", + "License": "MIT License", + "Year": 2024, + "Language": "multilingual", 
+ "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "MLDR is a Multilingual Long-Document Retrieval dataset built on Wikipeida, Wudao and mC4, covering 13 typologically diverse languages. Specifically, we sample lengthy articles from Wikipedia, Wudao and mC4 datasets and randomly choose paragraphs from them. Then we use GPT-3.5 to generate questions based on these paragraphs. The generated question and the sampled article constitute a new text pair to the dataset. The prompt for GPT3.5 is \u201cYou are a curious AI assistant, please generate one specific and valuable question based on the following text.", + "Volume": "7,607", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "M3-Embedding: Multi-Linguality, Multi-Functionality, Multi-Granularity Text Embeddings Through Self-Knowledge Distillation", + "Paper Link": "https://arxiv.org/pdf/2402.03216", + "Script": "Arab", + "Tokenized": "No", + "Host": "HuggingFace", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "information retrieval", + "Venue Title": "arXiv", + "Citations": "nan", + "Venue Type": "preprint", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "In this paper, we introduce a new embedding\nmodel called M3-Embedding, which is distinguished for its versatility in Multi-Linguality,\nMulti-Functionality, and Multi-Granularity. It\nprovides a uniform support for the semantic retrieval of more than 100 working languages. It\ncan simultaneously accomplish the three common retrieval functionalities: dense retrieval,\nmulti-vector retrieval, and sparse retrieval. Besides, it is also capable of processing inputs\nof different granularities, spanning from short\nsentences to long documents of up to 8,192 tokens. The effective training of M3-Embedding\npresents a series of technical contributions. Notably, we propose a novel self-knowledge distillation approach, where the relevance scores\nfrom different retrieval functionalities can be\nintegrated as the teacher signal to enhance\nthe training quality. We also optimize the\nbatching strategy, which enables a large batch\nsize and high training throughput to improve\nthe discriminativeness of embeddings. 
M3-\nEmbedding exhibits a superior performance in\nour experiment, leading to new state-of-the-art\nresults on multilingual, cross-lingua", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/mlma_hate_speech.json b/datasets/mlma_hate_speech.json new file mode 100644 index 0000000..c60204b --- /dev/null +++ b/datasets/mlma_hate_speech.json @@ -0,0 +1,36 @@ +{ + "Name": "MLMA hate speech", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/MLMA_hate_speech_ar", + "Link": "https://github.com/HKUST-KnowComp/MLMA_hate_speech", + "License": "MIT License", + "Year": 2019, + "Language": "ar", + "Dialect": "mixed", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "Multilingual and Multi-Aspect Hate Speech Analysis", + "Volume": "3,354", + "Unit": "sentences", + "Ethical Risks": "High", + "Provider": "HKUST", + "Derived From": "nan", + "Paper Title": "Multilingual and Multi-Aspect Hate Speech Analysis", + "Paper Link": "https://aclanthology.org/D19-1474.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "hate speech detection, abusive language detection", + "Venue Title": "EMNLP", + "Citations": "57.0", + "Venue Type": "conference", + "Venue Name": "Conference on Empirical Methods in Natural Language Processing", + "Authors": "N. Ousidhoum,Zizheng Lin,Hongming Zhang,Y. Song,D. Yeung", + "Affiliations": ",,,,", + "Abstract": "Current research on hate speech analysis is typically oriented towards monolingual and single classification tasks. In this paper, we present a new multilingual multi-aspect hate speech analysis dataset and use it to test the current state-of-the-art multilingual multitask learning approaches. We evaluate our dataset in various classification settings, then we discuss how to leverage our annotations in order to improve hate speech detection and classification in general.", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/mlqa.json b/datasets/mlqa.json new file mode 100644 index 0000000..ee50b94 --- /dev/null +++ b/datasets/mlqa.json @@ -0,0 +1,36 @@ +{ + "Name": "MLQA", + "Subsets": [], + "HF Link": "https://hf.co/datasets/facebook/mlqa", + "Link": "https://github.com/facebookresearch/mlqa", + "License": "CC BY-SA 3.0", + "Year": 2020, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "5K extractive QA instances (12K in English) in SQuAD format in seven languages - English, Arabic, German, Spanish, Hindi, Vietnamese and Simplified Chinese.", + "Volume": "5,852", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "Facebook", + "Derived From": "nan", + "Paper Title": "MLQA: Evaluating Cross-lingual Extractive Question Answering", + "Paper Link": "https://arxiv.org/pdf/1910.07475.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "question answering", + "Venue Title": "ACL", + "Citations": "124.0", + "Venue Type": "conference", + "Venue Name": "Association for Computational Linguistics", + "Authors": "Patrick Lewis,Barlas O\u011fuz,Ruty Rinott,S.
Riedel,Holger Schwenk", + "Affiliations": "Facebook AI Research;University College London,Facebook AI,,,", + "Abstract": "Question answering (QA) models have shown rapid progress enabled by the availability of large, high-quality benchmark datasets. Such annotated datasets are difficult and costly to collect, and rarely exist in languages other than English, making building QA systems that work well in other languages challenging. In order to develop such systems, it is crucial to invest in high quality multilingual evaluation benchmarks to measure progress. We present MLQA, a multi-way aligned extractive QA evaluation benchmark intended to spur research in this area. MLQA contains QA instances in 7 languages, English, Arabic, German, Spanish, Hindi, Vietnamese and Simplified Chinese. MLQA has over 12K instances in English and 5K in each other language, with each instance parallel between 4 languages on average. We evaluate state-of-the-art cross-lingual models and machine-translation-based baselines on MLQA. In all cases, transfer results are shown to be significantly behind training-language performance.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/mmac.json b/datasets/mmac.json new file mode 100644 index 0000000..c3a4b58 --- /dev/null +++ b/datasets/mmac.json @@ -0,0 +1,36 @@ +{ + "Name": "MMAC", + "Subsets": [], + "HF Link": "nan", + "Link": "http://www.ashrafraouf.com/mmac", + "License": "unknown", + "Year": 2010, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "crawling", + "Description": "The multi-modal Arabic corpus contains 6 million Arabic words selected from various sources covering old Arabic, religious texts, traditional language, modern language, different specialisations and very modern material from online \u201cchat rooms.\u201d", + "Volume": "6,000,000", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "Multiple Institutions", + "Derived From": "nan", + "Paper Title": "Building a multi-modal Arabic corpus (MMAC)", + "Paper Link": "https://link.springer.com/content/pdf/10.1007/s10032-010-0128-2.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "dictionary ", + "Venue Title": "IJDAR", + "Citations": "nan", + "Venue Type": "journal", + "Venue Name": "International Journal on Document Analysis and Recognition ", + "Authors": "Ashraf AbdelRaouf, Colin A. Higgins, Tony Pridmore, Mahmoud Khalil", + "Affiliations": "Misr International University, The University of Nottingham, The University of Nottingham, , Ain Shams University ", + "Abstract": "Traditionally, a corpus is a large structured set\nof text, electronically stored and processed. Corpora have\nbecome very important in the study of languages. They\nhave opened new areas of linguistic research, which were\nunknown until recently. Corpora are also key to the development of optical character recognition (OCR) applications. Access to a corpus of both language and images is\nessential during OCR development, particularly while training and testing a recognition application. Excellent corpora\nhave been developed for Latin-based languages, but few\nrelate to the Arabic language. This limits the penetration of\nboth corpus linguistics and OCR in Arabic-speaking countries. 
This paper describes the construction and provides a\ncomprehensive study and analysis of a multi-modal Arabic\ncorpus (MMAC) that is suitable for use in both OCR development and linguistics. MMAC currently contains six million\nArabic words and, unlike previous corpora, also includes\nconnected segments or pieces of Arabic words (PAWs) as\nwell as naked pieces of Arabic words (NPAWs) and naked words (NWords); PAWs and Words without diacritical marks.\nMulti-modal data is generated from both text, gathered from\na wide variety of sources, and images of existing documents.\nText-based data is complemented by a set of artificially generated images showing each of the Words, NWords, PAWs\nand NPAWs involved. Applications are provided to generate a natural-looking degradation to the generated images.\nA ground truth annotation is offered for each such image,\nwhile natural images showing small paragraphs and full\npages are augmented with representations of the text they\ndepict. A statistical analysis and verification of the dataset\nhas been carried out and is presented. MMAC was also tested\nusing commercial OCR software and is publicly and freely\navailable.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/mmedc.json b/datasets/mmedc.json new file mode 100644 index 0000000..fff6baa --- /dev/null +++ b/datasets/mmedc.json @@ -0,0 +1,36 @@ +{ + "Name": "MMedC ", + "Subsets": [], + "HF Link": "https://hf.co/datasets/Henrychur/MMedC", + "Link": "https://hf.co/datasets/Henrychur/MMedC", + "License": "CC BY-NC-SA 4.0", + "Year": 2024, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "a multilingual medical corpus with 25.5 billion tokens", + "Volume": "640,000,000", + "Unit": "tokens", + "Ethical Risks": "High", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "Towards Building Multilingual Language Model for Medicine", + "Paper Link": "https://arxiv.org/pdf/2402.13963", + "Script": "Arab", + "Tokenized": "No", + "Host": "HuggingFace", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "language modeling", + "Venue Title": "arXiv", + "Citations": "nan", + "Venue Type": "preprint", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "The development of open-source, multilingual medical language models can benefit a wide, linguistically\ndiverse audience from different regions. To promote this domain, we present contributions from the\nfollowing: First, we construct a multilingual medical corpus, containing approximately 25.5B tokens\nencompassing 6 main languages, termed as MMedC, enabling auto-regressive domain adaptation for\ngeneral LLMs; Second, to monitor the development of multilingual medical LLMs, we propose a multilingual\nmedical multi-choice question-answering benchmark with rationale, termed as MMedBench; Third, we\nhave assessed a number of open-source large language models (LLMs) on our benchmark, along with those\nfurther auto-regressive trained on MMedC. Our final model, MMed-Llama 3, with only 8B parameters,\nachieves superior performance compared to all other open-source models on both MMedBench and English\nbenchmarks, even rivaling GPT-4. 
In conclusion, in this work, we present a large-scale corpus, a benchmark\nand a series of models to support the development of multilingual medical LLMs.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/moarlex.json b/datasets/moarlex.json new file mode 100644 index 0000000..b204c06 --- /dev/null +++ b/datasets/moarlex.json @@ -0,0 +1,36 @@ +{ + "Name": "MoArLex", + "Subsets": [], + "HF Link": "nan", + "Link": "https://github.com/Mohabyoussef09/MoArLex", + "License": "CC BY 3.0", + "Year": 2018, + "Language": "ar", + "Dialect": "mixed", + "Domain": "social media", + "Form": "text", + "Collection Style": "other", + "Description": "MoArLex is a large-scale Arabic sentiment lexicon developed by expanding an existing seed lexicon, NileULex, using word embeddings from AraVec (trained on Arabic Twitter data). The lexicon includes sentiment words commonly used in social media, including various forms of dialectal Arabic and even misspelled or emphasized words typical of social media communication. It covers positive, negative, and strong positive/negative sentiment labels.", + "Volume": "36,775", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "Nile University", + "Derived From": "Seed lexicon NileULex and word embeddings model AraVec", + "Paper Title": "MoArLex: An Arabic Sentiment Lexicon Built Through Automatic Lexicon Expansion", + "Paper Link": "https://www.sciencedirect.com/science/article/pii/S1877050918321665", + "Script": "Arab",
"Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "sentiment analysis, opinion mining", + "Venue Title": "ACLing", + "Citations": "nan", + "Venue Type": "conference", + "Venue Name": "4th International Conference on Arabic Computational Linguistics ", + "Authors": "Mohab Youssef and Samhaa R. El-Beltagy", + "Affiliations": "Nile University, Egypt", + "Abstract": "MoArLex presents a novel approach to building a large-scale Arabic sentiment lexicon by automatically expanding an existing seed lexicon (NileULex) using word embeddings. The lexicon is designed to be used for sentiment analysis tasks on Arabic social media data and includes dialectal and colloquial terms commonly used in online communication. It demonstrates high accuracy compared to lexicons generated through machine translation and other automated methods.", + "Added By": "Maryam Al Emadi" +} \ No newline at end of file diff --git a/datasets/moroccan_arabic_wikipedia_20230101_bots.json b/datasets/moroccan_arabic_wikipedia_20230101_bots.json new file mode 100644 index 0000000..6a3d9fb --- /dev/null +++ b/datasets/moroccan_arabic_wikipedia_20230101_bots.json @@ -0,0 +1,36 @@ +{ + "Name": "Moroccan_Arabic_Wikipedia_20230101_bots", + "Subsets": [], + "HF Link": "https://hf.co/datasets/SaiedAlshahrani/Moroccan_Arabic_Wikipedia_20230101_bots", + "Link": "https://hf.co/datasets/SaiedAlshahrani/Moroccan_Arabic_Wikipedia_20230101_bots", + "License": "MIT License", + "Year": 2023, + "Language": "ar", + "Dialect": "ar-MA: (Arabic (Morocco))", + "Domain": "wikipedia", + "Form": "text", + "Collection Style": "manual curation", + "Description": "Moroccan_Arabic_Wikipedia_20230101_bots is a dataset created using the Moroccan Arabic Wikipedia articles, including the bot-generated articles, downloaded on the 1st of January 2023, and processed to train a Moroccan Arabic RoBERTa model.", + "Volume": "5,400", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "Clarkson University", + "Derived From": "Moroccan Arabic Wikipedia Dump 2023-01-01", + "Paper Title": "Performance Implications of Using Unrepresentative Corpora in Arabic Natural Language Processing", + "Paper Link": "https://aclanthology.org/2023.arabicnlp-1.19.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "HuggingFace", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "language modeling", + "Venue Title": "ArabicNLP 2023", + "Citations": "nan", + "Venue Type": "conference", + "Venue Name": "The First Arabic Natural Language Processing Conference", + "Authors": "Saied Alshahrani, Norah Alshahrani, Soumyabrata Dey, Jeanna Matthews", + "Affiliations": "Clarkson University", + "Abstract": "Wikipedia articles are a widely used source of training data for Natural Language Processing (NLP) research, particularly as corpora for low-resource languages like Arabic. However, it is essential to understand the extent to which these corpora reflect the representative contributions of native speakers, especially when many entries in a given language are directly translated from other languages or automatically generated through automated mechanisms. In this paper, we study the performance implications of using inorganic corpora that are not representative of native speakers and are generated through automated techniques such as bot generation or automated template-based translation. 
The case of the Arabic Wikipedia editions gives a unique case study of this since the Moroccan Arabic Wikipedia edition (ARY) is small but representative, the Egyptian Arabic Wikipedia edition (ARZ) is large but unrepresentative, and the Modern Standard Arabic Wikipedia edition (AR) is both large and more representative. We intrinsically evaluate the performance of two main NLP upstream tasks, namely word representation and language modeling, using word analogy evaluations and fill-mask evaluations using our two newly created datasets: Arab States Analogy Dataset (ASAD) and Masked Arab States Dataset (MASD). We demonstrate that for good NLP performance, we need both large and organic corpora; neither alone is sufficient. We show that producing large corpora through automated means can be a counter-productive, producing models that both perform worse and lack cultural richness and meaningful representation of the Arabic language and its native speakers.", + "Added By": "Saied Alshahrani" +} \ No newline at end of file diff --git a/datasets/moroccan_arabic_wikipedia_20230101_nobots.json b/datasets/moroccan_arabic_wikipedia_20230101_nobots.json new file mode 100644 index 0000000..a371b31 --- /dev/null +++ b/datasets/moroccan_arabic_wikipedia_20230101_nobots.json @@ -0,0 +1,36 @@ +{ + "Name": "Moroccan_Arabic_Wikipedia_20230101_nobots", + "Subsets": [], + "HF Link": "https://hf.co/datasets/SaiedAlshahrani/Moroccan_Arabic_Wikipedia_20230101_nobots", + "Link": "https://hf.co/datasets/SaiedAlshahrani/Moroccan_Arabic_Wikipedia_20230101_nobots", + "License": "MIT License", + "Year": 2023, + "Language": "ar", + "Dialect": "ar-MA: (Arabic (Morocco))", + "Domain": "wikipedia", + "Form": "text", + "Collection Style": "manual curation", + "Description": "Moroccan_Arabic_Wikipedia_20230101_nobots is a dataset created using the Moroccan Arabic Wikipedia articles, excluding the bot-generated articles, downloaded on the 1st of January 2023, and processed to train a Moroccan Arabic RoBERTa model.", + "Volume": "4,680", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "Clarkson University", + "Derived From": "Moroccan Arabic Wikipedia Dump 2023-01-01", + "Paper Title": "Performance Implications of Using Unrepresentative Corpora in Arabic Natural Language Processing", + "Paper Link": "https://aclanthology.org/2023.arabicnlp-1.19.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "HuggingFace", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "language modeling", + "Venue Title": "ArabicNLP 2023", + "Citations": "nan", + "Venue Type": "conference", + "Venue Name": "The First Arabic Natural Language Processing Conference", + "Authors": "Saied Alshahrani, Norah Alshahrani, Soumyabrata Dey, Jeanna Matthews", + "Affiliations": "Clarkson University", + "Abstract": "Wikipedia articles are a widely used source of training data for Natural Language Processing (NLP) research, particularly as corpora for low-resource languages like Arabic. However, it is essential to understand the extent to which these corpora reflect the representative contributions of native speakers, especially when many entries in a given language are directly translated from other languages or automatically generated through automated mechanisms. In this paper, we study the performance implications of using inorganic corpora that are not representative of native speakers and are generated through automated techniques such as bot generation or automated template-based translation. 
The case of the Arabic Wikipedia editions gives a unique case study of this since the Moroccan Arabic Wikipedia edition (ARY) is small but representative, the Egyptian Arabic Wikipedia edition (ARZ) is large but unrepresentative, and the Modern Standard Arabic Wikipedia edition (AR) is both large and more representative. We intrinsically evaluate the performance of two main NLP upstream tasks, namely word representation and language modeling, using word analogy evaluations and fill-mask evaluations using our two newly created datasets: Arab States Analogy Dataset (ASAD) and Masked Arab States Dataset (MASD). We demonstrate that for good NLP performance, we need both large and organic corpora; neither alone is sufficient. We show that producing large corpora through automated means can be a counter-productive, producing models that both perform worse and lack cultural richness and meaningful representation of the Arabic language and its native speakers.", + "Added By": "Saied Alshahrani" +} \ No newline at end of file diff --git a/datasets/mozilla_foundation_common_voice_dataset.json b/datasets/mozilla_foundation_common_voice_dataset.json new file mode 100644 index 0000000..0341d4d --- /dev/null +++ b/datasets/mozilla_foundation_common_voice_dataset.json @@ -0,0 +1,36 @@ +{ + "Name": "mozilla foundation common voice dataset", + "Subsets": [], + "HF Link": "https://hf.co/datasets/mozilla-foundation/common_voice_17_0", + "Link": "https://github.com/common-voice/cv-dataset/tree/main/datasets", + "License": "MPL-2.0", + "Year": 2020, + "Language": "multilingual", + "Dialect": "mixed", + "Domain": "transcribed audio", + "Form": "spoken", + "Collection Style": "other", + "Description": "The Common Voice dataset consists of a unique MP3 and corresponding text file. Many of the 20217 recorded hours in the dataset also include demographic metadata like age, sex, and accent that can help improve the accuracy of speech recognition engines.", + "Volume": "87", + "Unit": "hours", + "Ethical Risks": "Low", + "Provider": "Mozilla, Indiana University, Artie, Inc.", + "Derived From": "nan", + "Paper Title": "Common Voice: A Massively-Multilingual Speech Corpus", + "Paper Link": "https://arxiv.org/pdf/1912.06670.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "HuggingFace", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "speech recognition, language identification", + "Venue Title": "LREC", + "Citations": "369.0", + "Venue Type": "conference", + "Venue Name": "International Conference on Language Resources and Evaluation", + "Authors": "Rosana Ardila, Megan Branson, Kelly Davis, Michael Henretty, Michael Kohler, Josh Meyer, Reuben Morais, Lindsay Saunders, Francis M. Tyers, Gregor Weber", + "Affiliations": "Mozilla, Indiana University, Artie, Inc.", + "Abstract": "The Common Voice corpus is a massively-multilingual collection of transcribed speech intended for speech technology research and development. Common Voice is designed for Automatic Speech Recognition purposes but can be useful in other domains (e.g. language\nidentification). To achieve scale and sustainability, the Common Voice project employs crowdsourcing for both data collection and data validation. The most recent release includes 29 languages, and as of November 2019 there are a total of 38 languages collecting data.\nOver 50,000 individuals have participated so far, resulting in 2,500 hours of collected audio. 
To our knowledge this is the largest audio corpus in the public domain for speech recognition, both in terms of number of hours and number of languages. As an example use\ncase for Common Voice, we present speech recognition experiments using Mozilla\u2019s DeepSpeech Speech-to-Text toolkit. By applying transfer learning from a source English model, we find an average Character Error Rate improvement of 5.99 \u00b1 5.48 for twelve target\nlanguages (German, French, Italian, Turkish, Catalan, Slovenian, Welsh, Irish, Breton, Tatar, Chuvash, and Kabyle). For most of these languages, these are the first ever published results on end-to-end Automatic Speech Recognition.", + "Added By": "Wafaa Mohammed" +} \ No newline at end of file diff --git a/datasets/mpold__multi_platforms_offensive_language_dataset.json b/datasets/mpold__multi_platforms_offensive_language_dataset.json new file mode 100644 index 0000000..3065dbe --- /dev/null +++ b/datasets/mpold__multi_platforms_offensive_language_dataset.json @@ -0,0 +1,36 @@ +{ + "Name": "MPOLD: Multi Platforms Offensive Language Dataset", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/MPOLD", + "Link": "https://github.com/shammur/Arabic-Offensive-Multi-Platform-SocialMedia-Comment-Dataset", + "License": "Apache-2.0", + "Year": 2020, + "Language": "ar", + "Dialect": "mixed", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "Arabic Offensive Comments dataset from Multiple Social Media Platforms", + "Volume": "400", + "Unit": "documents", + "Ethical Risks": "Medium", + "Provider": "QCRI", + "Derived From": "nan", + "Paper Title": "A Multi-Platform Arabic News Comment Dataset for Offensive Language Detection", + "Paper Link": "https://aclanthology.org/2020.lrec-1.761.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "offensive language detection", + "Venue Title": "LREC", + "Citations": "10.0", + "Venue Type": "conference", + "Venue Name": "International Conference on Language Resources and Evaluation", + "Authors": "S. A. Chowdhury,Hamdy Mubarak,Ahmed Abdelali,Soon-Gyo Jung,B. Jansen,Joni O. Salminen", + "Affiliations": ",,,,,", + "Abstract": "Access to social media often enables users to engage in conversation with limited accountability. This allows a user to share their opinions and ideology, especially regarding public content, occasionally adopting offensive language. This may encourage hate crimes or cause mental harm to targeted individuals or groups. Hence, it is important to detect offensive comments in social media platforms. Typically, most studies focus on offensive commenting in one platform only, even though the problem of offensive language is observed across multiple platforms. Therefore, in this paper, we introduce and make publicly available a new dialectal Arabic news comment dataset, collected from multiple social media platforms, including Twitter, Facebook, and YouTube. We follow two-step crowd-annotator selection criteria for low-representative language annotation task in a crowdsourcing platform. Furthermore, we analyze the distinctive lexical content along with the use of emojis in offensive comments. We train and evaluate the classifiers using the annotated multi-platform dataset along with other publicly available data. 
Our results highlight the importance of multiple platform dataset for (a) cross-platform, (b) cross-domain, and (c) cross-dialect generalization of classifier performance.", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/mr__tydi.json b/datasets/mr__tydi.json new file mode 100644 index 0000000..3e3142c --- /dev/null +++ b/datasets/mr__tydi.json @@ -0,0 +1,36 @@ +{ + "Name": "Mr. TyDi", + "Subsets": [], + "HF Link": "https://hf.co/datasets/castorini/mr-tydi", + "Link": "https://github.com/castorini/mr.tydi", + "License": "unknown", + "Year": 2021, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "Mr. TYDI is constructed from TYDI, a question answering dataset covering eleven typologically diverse languages", + "Volume": "16,573", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "University of Waterloo", + "Derived From": "TYDI", + "Paper Title": "Mr. TYDI: A Multi-lingual Benchmark for Dense Retrieval", + "Paper Link": "https://arxiv.org/pdf/2108.08787.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "question answering", + "Venue Title": "ArXiv", + "Citations": "nan", + "Venue Type": "preprint", + "Venue Name": "nan", + "Authors": "Xinyu Zhang, Jimmy J. Lin", + "Affiliations": ",", + "Abstract": "We present Mr. TYDI, a multi-lingual benchmark dataset for mono-lingual retrieval in eleven typologically diverse languages, designed to evaluate ranking with learned dense representations. The goal of this resource is to spur research in dense retrieval techniques in non-English languages, motivated by recent observations that existing techniques for representation learning perform poorly when applied to out-of-distribution data. As a starting point, we provide zero-shot baselines for this new dataset based on a multi-lingual adaptation of DPR that we call \u201cmDPR\u201d. Experiments show that although the effectiveness of mDPR is much lower than BM25, dense representations nevertheless appear to provide valuable relevance signals, improving BM25 results in sparse\u2013dense hybrids. In addition to analyses of our results, we also discuss future challenges and present a research agenda in multi-lingual dense retrieval. Mr. 
TYDI can be downloaded at https://github.com/castorini/mr.tydi.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/msac.json b/datasets/msac.json new file mode 100644 index 0000000..e4b3e0a --- /dev/null +++ b/datasets/msac.json @@ -0,0 +1,36 @@ +{ + "Name": "MSAC", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/MSAC", + "Link": "https://github.com/ososs/Arabic-Sentiment-Analysis-corpus", + "License": "unknown", + "Year": 2020, + "Language": "ar", + "Dialect": "ar-MA: (Arabic (Morocco))", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "A rich and publicly available Arabic corpus called the Moroccan Sentiment Analysis Corpus (MSAC)", + "Volume": "2,000", + "Unit": "sentences", + "Ethical Risks": "Medium", + "Provider": "Ibn Tofail University", + "Derived From": "nan", + "Paper Title": "ASA: A framework for Arabic sentiment analysis", + "Paper Link": "https://dl.acm.org/doi/abs/10.1177/0165551519849516", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "sentiment analysis", + "Venue Title": "JIS", + "Citations": "20.0", + "Venue Type": "journal", + "Venue Name": "Journal of Information Science", + "Authors": "Ahmed Oussous, Fatima-Zahra Benjelloun, A. A. Lahcen, Samir Belfkih", + "Affiliations": ",,,", + "Abstract": "Sentiment analysis (SA), also known as opinion mining, is a growing important research area. Generally, it helps to automatically determine if a text expresses a positive, negative or neutral sentiment. It enables to mine the huge increasing resources of shared opinions such as social networks, review sites and blogs. In fact, SA is used by many fields and for various languages such as English and Arabic. However, since Arabic is a highly inflectional and derivational language, it raises many challenges. In fact, SA of Arabic text should handle such complex morphology. To better handle these challenges, we decided to provide the research community and Arabic users with a new efficient framework for Arabic Sentiment Analysis (ASA). Our primary goal is to improve the performance of ASA by exploiting deep learning while varying the preprocessing techniques. For that, we implement and evaluate two deep learning models namely convolutional neural network (CNN) and long short-term memory (LSTM) models. The framework offers various preprocessing techniques for ASA (including stemming, normalisation, tokenization and stop words). As a result of this work, we first provide a new rich and publicly available Arabic corpus called Moroccan Sentiment Analysis Corpus (MSAC). Second, the proposed framework demonstrates improvement in ASA. In fact, the experimental results prove that deep learning models have a better performance for ASA than classical approaches (support vector machines, naive Bayes classifiers and maximum entropy). 
They also show the key role of morphological features in Arabic Natural Language Processing (NLP).", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/msda.json b/datasets/msda.json new file mode 100644 index 0000000..4f44730 --- /dev/null +++ b/datasets/msda.json @@ -0,0 +1,36 @@ +{ + "Name": "MSDA", + "Subsets": [], + "HF Link": "nan", + "Link": "https://msda.um6p.ma/msda_datasets", + "License": "CC BY 2.0", + "Year": 2021, + "Language": "ar", + "Dialect": "mixed", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "Tweets annotated for sentiment analysis and topic detection", + "Volume": "50,000", + "Unit": "sentences", + "Ethical Risks": "Medium", + "Provider": "MSDA-UM6P", + "Derived From": "nan", + "Paper Title": "An open access NLP dataset for Arabic dialects : Data collection, labeling, and model construction", + "Paper Link": "https://arxiv.org/abs/2102.11000", + "Script": "Arab", + "Tokenized": "No", + "Host": "OneDrive", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "sentiment analysis, topic classification", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "saad benjelloun" +} \ No newline at end of file diff --git a/datasets/mtvqa.json b/datasets/mtvqa.json new file mode 100644 index 0000000..e9917d4 --- /dev/null +++ b/datasets/mtvqa.json @@ -0,0 +1,36 @@ +{ + "Name": "MTVQA", + "Subsets": [], + "HF Link": "https://hf.co/datasets/ByteDance/MTVQA", + "Link": "https://hf.co/datasets/ByteDance/MTVQA", + "License": "CC BY-NC 4.0", + "Year": 2024, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "images", + "Collection Style": "other", + "Description": "The dataset is oriented toward visual question answering of multilingual text scenes in nine languages, including Korean, Japanese, Italian, Russian, German, French, Thai, Arabic, and Vietnamese.", + "Volume": "818", + "Unit": "images", + "Ethical Risks": "Low", + "Provider": "Bytedance", + "Derived From": "nan", + "Paper Title": "MTVQA: Benchmarking Multilingual Text-Centric Visual Question Answering", + "Paper Link": "https://arxiv.org/pdf/2405.11985", + "Script": "Arab", + "Tokenized": "No", + "Host": "HuggingFace", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "image captioning", + "Venue Title": "arXiv", + "Citations": "nan", + "Venue Type": "preprint", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "Text-Centric Visual Question Answering (TEC-VQA) in its proper format not\nonly facilitates human-machine interaction in text-centric visual environments but\nalso serves as a de facto gold proxy to evaluate AI models in the domain of text-centric scene understanding. Nonetheless, most existing TEC-VQA benchmarks\nhave focused on high-resource languages like English and Chinese. Despite pioneering works to expand multilingual QA pairs in non-text-centric VQA datasets\nthrough translation engines, the translation-based protocol encounters a substantial\n\u201cvisual-textual misalignment\u201d problem when applied to TEC-VQA. Specifically,\nit prioritizes the text in question-answer pairs while disregarding the visual text\npresent in images. 
Moreover, it fails to address complexities related to nuanced\nmeaning, contextual distortion, language bias, and question-type diversity. In\nthis work, we tackle multilingual TEC-VQA by introducing MTVQA, the first\nbenchmark featuring high-quality human expert annotations across 9 diverse languages, consisting of 6,778 question-answer pairs across 2,116 images. Further,\nby comprehensively evaluating numerous state-of-the-art Multimodal Large Language Models (MLLMs), including GPT-4o, GPT-4V, Claude3, and Gemini, on\nthe MTVQA dataset, it is evident that there is still a large room for performance\nimprovement, underscoring the value of MTVQA. Additionally, we supply multilingual training data within the MTVQA dataset, demonstrating that straightforward\nfine-tuning with this data can substantially enhance multilingual TEC-VQA performance. We aspire that MTVQA will offer the research community fresh insights\nand stimulate further exploration in multilingual visual text comprehension. The\nproject homepage is at https://bytedance.github.io/MTVQA/", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/multi-language_conversational_telephone_speech_2011_--_arabic_group.json b/datasets/multi-language_conversational_telephone_speech_2011_--_arabic_group.json new file mode 100644 index 0000000..ec395d7 --- /dev/null +++ b/datasets/multi-language_conversational_telephone_speech_2011_--_arabic_group.json @@ -0,0 +1,55 @@ +{ + "Name": "Multi-Language Conversational Telephone Speech 2011 -- Arabic Group", + "Subsets": [ + { + "Name": "iraqi", + "Dialect": "ar-IQ: (Arabic (Iraq))", + "Volume": "37.4", + "Unit": "hours" + }, + { + "Name": "levantine", + "Dialect": "ar-LEV: (Arabic (Levant))", + "Volume": "41.1", + "Unit": "hours" + }, + { + "Name": "maghrebi", + "Dialect": "ar-MA: (Arabic (Morocco))", + "Volume": "38.6", + "Unit": "hours" + } + ], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2019S02", + "License": "LDC User Agreement for Non-Members", + "Year": 2019, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "transcribed audio", + "Form": "spoken", + "Collection Style": "other", + "Description": "Participants were recruited by native speakers who contacted acquaintances in their social network. Those native speakers made one call, up to 15 minutes, to each acquaintance. The data was collected using LDC's telephone collection infrastructure, comprised of three computer telephony systems. Human auditors labeled calls for callee gender, dialect type and noise. 
Demographic information about the participants was not collected.", + "Volume": "37.4", + "Unit": "hours", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "2,500.00 $", + "Test Split": "No", + "Tasks": "language identification", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git "a/datasets/multilingual_dictionary_of_sports__\342\200\223_english-french-arabic_trilingual_database.json" "b/datasets/multilingual_dictionary_of_sports__\342\200\223_english-french-arabic_trilingual_database.json" new file mode 100644 index 0000000..2db9538 --- /dev/null +++ "b/datasets/multilingual_dictionary_of_sports__\342\200\223_english-french-arabic_trilingual_database.json" @@ -0,0 +1,36 @@ +{ + "Name": "Multilingual Dictionary of Sports \u2013 English-French-Arabic trilingual database", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalogue.elra.info/en-us/repository/browse/ELRA-T0372_04/", + "License": "Non Commercial Use - ELRA END USER", + "Year": 2009, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "This dictionary was produced within the French national project EuRADic (European and Arabic Dictionaries and Corpora), as part of the Technolangue programme funded by the French Ministry of Industry.", + "Volume": "40,000", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "French Ministry of Industry", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "ELRA", + "Access": "With-Fee", + "Cost": "200.00\u20ac", + "Test Split": "No", + "Tasks": "dictionary", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git "a/datasets/multilingual_hate\r\nspeech_detection_dataset.json" "b/datasets/multilingual_hate\r\nspeech_detection_dataset.json" new file mode 100644 index 0000000..603bd21 --- /dev/null +++ "b/datasets/multilingual_hate\r\nspeech_detection_dataset.json" @@ -0,0 +1,36 @@ +{ + "Name": "Multilingual Hate Speech Detection Dataset", + "Subsets": [], + "HF Link": "nan", + "Link": "https://github.com/hate-alert/DE-LIMIT/tree/master/Dataset", + "License": "unknown", + "Year": 2020, + "Language": "multilingual", + "Dialect": "mixed", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "Combined MLMA and L-HSAB datasets", + "Volume": "5,790", + "Unit": "sentences", + "Ethical Risks": "High", + "Provider": "Indian Institute of Technology Kharagpur", + "Derived From": "L-HSAB, MLMA", + "Paper Title": "Deep Learning Models for Multilingual Hate Speech Detection", + "Paper Link": "https://arxiv.org/pdf/2004.06465.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "hate speech detection", + "Venue Title": "ArXiv", + "Citations": "27.0", + "Venue Type": "preprint", + "Venue Name": "ArXiv", + "Authors": "Sai Saket Aluru, Binny Mathew, Punyajoy Saha, Animesh Mukherjee", + "Affiliations": ",Indian Institute of Technology Kharagpur,,", + "Abstract": "Hate speech detection is a challenging problem with most of the datasets available in only one language: English. In this paper, we conduct a large scale analysis of multilingual hate speech in 9 languages from 16 different sources. We observe that in low resource setting, simple models such as LASER embedding with logistic regression performs the best, while in high resource setting BERT based models perform better. In case of zero-shot classification, languages such as Italian and Portuguese achieve good results. Our proposed framework could be used as an efficient solution for low-resource languages. These models could also act as good baselines for future multilingual hate speech detection tasks. We have made our code and experimental settings public for other researchers at this https URL.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/multilingual_lama.json b/datasets/multilingual_lama.json new file mode 100644 index 0000000..776c7cc --- /dev/null +++ b/datasets/multilingual_lama.json @@ -0,0 +1,36 @@ +{ + "Name": "Multilingual LAMA", + "Subsets": [], + "HF Link": "https://hf.co/datasets/cis-lmu/m_lama", + "Link": "https://github.com/norakassner/mlama", + "License": "CC BY-NC 4.0", + "Year": 2021, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "machine translation", + "Description": "Multilingual version of LAMA. The underlying idea of LAMA is to query knowledge from pretrained LMs using templates without any finetuning", + "Volume": "19,354", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "LMU Munich", + "Derived From": "nan", + "Paper Title": "Multilingual LAMA: Investigating Knowledge in Multilingual Pretrained Language Models", + "Paper Link": "https://arxiv.org/pdf/2102.00894.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "lm probing", + "Venue Title": "arXiv", + "Citations": "nan", + "Venue Type": "preprint", + "Venue Name": "nan", + "Authors": "Nora Kassner, Philipp Dufter, Hinrich Schutze", + "Affiliations": "Center for Information and Language Processing (CIS), LMU Munich", + "Abstract": "Recently, it has been found that monolingual English language models can be used as\nknowledge bases. Instead of structural knowledge base queries, masked sentences such as\n\u201cParis is the capital of [MASK]\u201d are used as\nprobes. We translate the established benchmarks TREx and GoogleRE into 53 languages.\nWorking with mBERT, we investigate three\nquestions. (i) Can mBERT be used as a multilingual knowledge base? Most prior work only\nconsiders English. Extending research to multiple languages is important for diversity and\naccessibility. (ii) Is mBERT\u2019s performance\nas knowledge base language-independent or\ndoes it vary from language to language? (iii)\nA multilingual model is trained on more text,\ne.g., mBERT is trained on 104 Wikipedias.\nCan mBERT leverage this for better performance? We find that using mBERT as a knowledge base yields varying performance across\nlanguages and pooling predictions across languages improves performance. 
Conversely,\nmBERT exhibits a language bias; e.g., when\nqueried in Italian, it tends to predict Italy as\nthe country of origin.\n", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/multilingual_reward_bench.json b/datasets/multilingual_reward_bench.json new file mode 100644 index 0000000..6d99964 --- /dev/null +++ b/datasets/multilingual_reward_bench.json @@ -0,0 +1,36 @@ +{ + "Name": "Multilingual Reward Bench", + "Subsets": [], + "HF Link": "https://hf.co/datasets/C4AI-Community/multilingual-reward-bench", + "Link": "https://hf.co/datasets/C4AI-Community/multilingual-reward-bench", + "License": "unknown", + "Year": 2024, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "machine translation", + "Description": "Reward models (RMs) have driven the development of state-of-the-art LLMs today, with unprecedented impact across the globe. However, their performance in multilingual settings still remains understudied. In order to probe reward model behavior on multilingual data, we present M-RewardBench, a benchmark for 23 typologically diverse languages. M-RewardBench contains prompt-chosen-rejected preference triples obtained by curating and translating chat, safety, and reasoning instances from RewardBench (Lambert et al., 2024)", + "Volume": "8,110", + "Unit": "sentences", + "Ethical Risks": "High", + "Provider": "C4AI", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "HuggingFace", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "reward modelling", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/multilingual_tts.json b/datasets/multilingual_tts.json new file mode 100644 index 0000000..a95c3ee --- /dev/null +++ b/datasets/multilingual_tts.json @@ -0,0 +1,36 @@ +{ + "Name": "Multilingual TTS", + "Subsets": [], + "HF Link": "https://hf.co/datasets/MohamedRashad/multilingual-tts", + "Link": "https://hf.co/datasets/MohamedRashad/multilingual-tts", + "License": "LGPL-3.0", + "Year": 2024, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "transcribed audio", + "Form": "spoken", + "Collection Style": "other", + "Description": "The Multilingual TTS dataset is an exceptional compilation of text-to-speech (TTS) samples, meticulously crafted to showcase the richness and diversity of human languages. This dataset encompasses a variety of real-world sentences in fifteen prominent languages, carefully chosen to reflect global linguistic diversity. 
Each sample is accompanied by its corresponding high-quality audio output.", + "Volume": "1,938", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "HuggingFace", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "text to speech", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/multiple-translation_arabic_(mta)_part_1.json b/datasets/multiple-translation_arabic_(mta)_part_1.json new file mode 100644 index 0000000..65f4d4d --- /dev/null +++ b/datasets/multiple-translation_arabic_(mta)_part_1.json @@ -0,0 +1,36 @@ +{ + "Name": "Multiple-Translation Arabic (MTA) Part 1", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2003T18", + "License": "LDC User Agreement for Non-Members", + "Year": 2003, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "other", + "Description": "Arabic newswire stories, each provided with multiple independent English translations. A breakdown of the data amounts by source is available on the LDC catalog page.", + "Volume": "141", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "1,000.00 $", + "Test Split": "No", + "Tasks": "cross-lingual information retrieval, language teaching, machine translation", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/multiple-translation_arabic_(mta)_part_2.json b/datasets/multiple-translation_arabic_(mta)_part_2.json new file mode 100644 index 0000000..942a346 --- /dev/null +++ b/datasets/multiple-translation_arabic_(mta)_part_2.json @@ -0,0 +1,36 @@ +{ + "Name": "Multiple-Translation Arabic (MTA) Part 2", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2005T05", + "License": "LDC User Agreement for Non-Members", + "Year": 2005, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "other", + "Description": "All source data was drawn from January and February 2003. 
A breakdown of the data amounts by source is available on the LDC catalog page.", + "Volume": "100", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "1,000.00 $", + "Test Split": "No", + "Tasks": "cross-lingual information retrieval, language teaching, machine translation", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/multitacred.json b/datasets/multitacred.json new file mode 100644 index 0000000..60460b1 --- /dev/null +++ b/datasets/multitacred.json @@ -0,0 +1,36 @@ +{ + "Name": "MultiTACRED", + "Subsets": [], + "HF Link": "https://hf.co/datasets/DFKI-SLT/multitacred", + "Link": "https://hf.co/datasets/DFKI-SLT/multitacred", + "License": "LDC User Agreement for Non-Members", + "Year": 2023, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "machine translation", + "Description": "MultiTACRED is a multilingual version of the large-scale TAC Relation Extraction Dataset. It covers 12 typologically diverse languages from 9 language families, and was created by the Speech & Language Technology group of DFKI by machine-translating the instances of the original TACRED dataset and automatically projecting their entity annotations.", + "Volume": "105,663", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "nan", + "Derived From": "TACRED", + "Paper Title": "MultiTACRED: A Multilingual Version of the TAC Relation Extraction Dataset", + "Paper Link": "https://aclanthology.org/2023.acl-long.210.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "HuggingFace", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "relation extraction", + "Venue Title": "ACL", + "Citations": "nan", + "Venue Type": "conference", + "Venue Name": "Association for Computational Linguistics", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "Relation extraction (RE) is a fundamental\ntask in information extraction, whose extension to multilingual settings has been hindered\nby the lack of supervised resources comparable in size to large English datasets such as\nTACRED (Zhang et al., 2017). To address this\ngap, we introduce the MultiTACRED dataset,\ncovering 12 typologically diverse languages\nfrom 9 language families, which is created by\nmachine-translating TACRED instances and\nautomatically projecting their entity annotations. We analyze translation and annotation\nprojection quality, identify error categories, and\nexperimentally evaluate fine-tuned pretrained\nmono- and multilingual language models in\ncommon transfer learning scenarios. Our analyses show that machine translation is a viable\nstrategy to transfer RE instances, with native\nspeakers judging more than 83% of the translated instances to be linguistically and semantically acceptable. We find monolingual RE\nmodel performance to be comparable to the\nEnglish original for many of the target languages, and that multilingual models trained\non a combination of English and target language data can outperform their monolingual\ncounterparts. 
However, we also observe a variety of translation and annotation projection\nerrors, both due to the MT systems and linguistic features of the target languages, such\nas pronoun-dropping, compounding and inflection, that degrade", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/multiun_v2.json b/datasets/multiun_v2.json new file mode 100644 index 0000000..827483b --- /dev/null +++ b/datasets/multiun_v2.json @@ -0,0 +1,36 @@ +{ + "Name": "MultiUN v2", + "Subsets": [], + "HF Link": "https://hf.co/datasets/Helsinki-NLP/multiun", + "Link": "http://www.euromatrixplus.net/multi-un/", + "License": "unknown", + "Year": 2010, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "human translation", + "Description": " 6 official languages of the UN, consisting of around 300 million words per language", + "Volume": "65,156", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "DFKI", + "Derived From": "nan", + "Paper Title": "MultiUN: A Multilingual Corpus from United Nation Documents\r", + "Paper Link": "https://www.dfki.de/fileadmin/user_upload/import/4790_686_Paper.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "machine translation", + "Venue Title": "LREC", + "Citations": "223.0", + "Venue Type": "conference", + "Venue Name": "International Conference on Language Resources and Evaluation", + "Authors": "A. Eisele", + "Affiliations": "nan", + "Abstract": "This paper describes the acquisition, preparation and properties of a corpus extracted from the official documents of the United Nations (UN). This corpus is available in all 6 official languages of the UN, consisting of around 300 million words per language. We describe the methods we used for crawling, document formatting, and sentence alignment. This corpus also includes a common test set for machine translation. We present the results of a French-Chinese machine translation experiment performed on this corpus.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/munazarat_1_0.json b/datasets/munazarat_1_0.json new file mode 100644 index 0000000..6874c19 --- /dev/null +++ b/datasets/munazarat_1_0.json @@ -0,0 +1,36 @@ +{ + "Name": "Munazarat 1.0", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/Munazarat", + "Link": "https://github.com/moh72y/Munazarat1.0/tree/main", + "License": "unknown", + "Year": 2024, + "Language": "ar", + "Dialect": "mixed", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "The Munazarat 1.0 Corpus is a unique resource for researchers interested in various aspects of Arabic competitive debating, Arabic linguistics studies, argumentation studies, education, and Arabic Natural Language Processing (NLP). 
It consists of approximately 50 hours of transcribed competitive debates, hosted by QatarDebate, covering university and school-level debates held between 2013 and 2023.", + "Volume": "73", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "Munazarat 1.0: A Corpus of Arabic Competitive Debates", + "Paper Link": "https://aclanthology.org/2024.osact-1.3.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "debate analysis", + "Venue Title": "OSACT", + "Citations": "nan", + "Venue Type": "workshop", + "Venue Name": "The Workshop on Open-Source Arabic Corpora and Processing Tools", + "Authors": "Mohammad Khader, AbdulGabbar Al-Sharafi, Mohamad Hamza Al-Sioufy, Wajdi Zaghouani and Ali Al-Zawqari", + "Affiliations": "nan", + "Abstract": "This paper introduces the Corpus of Arabic Competitive Debates, Munazarat. Despite the significance of\ncompetitive debating in fostering critical thinking and promoting dialogue, researchers in the fields of Arabic\nNatural Language Processing (NLP), linguistics, argumentation studies, and education have limited access\nto datasets on competitive debating. At this stage of the study, we introduce Munazarat 1.0, which combines\ntranscribed recordings of approximately 50 hours from 73 debates at QatarDebate-recognized tournaments,\nall available on YouTube. Munazarat is a novel specialized Arabic speech corpus, predominantly in Modern\nStandard Arabic (MSA), covering diverse debating topics and accompanied by metadata for each debate. The\ntranscription of debates was performed using Fenek, a speech-to-text Kanari AI tool, and reviewed by three native\nArabic speakers to enhance quality. The Munazarat 1.0 dataset can serve as a valuable resource for training\nArabic NLP tools, developing argumentation mining machines, and analyzing Arabic argumentation and rhetoric styles.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/nabra.json b/datasets/nabra.json new file mode 100644 index 0000000..2072875 --- /dev/null +++ b/datasets/nabra.json @@ -0,0 +1,36 @@ +{ + "Name": "Nabra", + "Subsets": [], + "HF Link": "nan", + "Link": "https://sina.birzeit.edu/currasat/", + "License": "unknown", + "Year": 2023, + "Language": "ar", + "Dialect": "ar-SY: (Arabic (Syria))", + "Domain": "other", + "Form": "text", + "Collection Style": "manual curation", + "Description": "A corpus of Syrian Arabic dialects with morphological annotations.", + "Volume": "60,000", + "Unit": "tokens", + "Ethical Risks": "Medium", + "Provider": "SinaLab, Birzeit University", + "Derived From": "nan", + "Paper Title": "Nabra: Syrian Arabic Dialects with Morphological Annotations", + "Paper Link": "https://aclanthology.org/2023.arabicnlp-1.2.pdf", + "Script": "Arab", + "Tokenized": "Yes", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "morphological analysis", + "Venue Title": "ArabicNLP", + "Citations": "nan", + "Venue Type": "conference", + "Venue Name": "Arabic Natural Language Processing Conference", + "Authors": "Amal Nayouf, Tymaa Hasanain Hammouda, Mustafa Jarrar, Fadi A. Zaraket, Mohamad-Bassam Kurdy", + "Affiliations": "nan", + "Abstract": "This paper presents N\u00e2b\u00afra, a corpora\nof Syrian Arabic dialects with morphological annotations. 
A team of Syrian natives\ncollected more than 6K sentences containing\nabout 60K words from several sources including social media posts, scripts of movies and\nseries, lyrics of songs and local proverbs to\nbuild N\u00e2b\u00afra. N\u00e2b\u00afra covers several local Syrian dialects including those of Aleppo, Damascus, Deir-ezzur, Hama, Homs, Huran, Latakia,\nMardin, Raqqah, and Suwayda. A team of\nnine annotators annotated the 60K tokens with\nfull morphological annotations across sentence\ncontexts. We trained the annotators to follow\nmethodological annotation guidelines to ensure\nunique morpheme annotations, and normalized\nthe annotations. F1 and \u03ba agreement scores\nranged between 74% and 98% across features,\nshowing the excellent quality of N\u00e2b\u00afra annotations. Our corpora are open-source and publicly available as part of the Currasat portal\nhttps://sina.birzeit.edu/currasat.\n", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/nada.json b/datasets/nada.json new file mode 100644 index 0000000..52770bd --- /dev/null +++ b/datasets/nada.json @@ -0,0 +1,36 @@ +{ + "Name": "NADA", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/NADA", + "Link": "https://www.researchgate.net/publication/326060650_NADA_A_New_Arabic_Dataset", + "License": "unknown", + "Year": 2018, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "crawling", + "Description": "The NADA corpus is collected from two existing corpora: the Diab Dataset (DAA) corpus and the OSAC corpus.", + "Volume": "13,066", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "Multiple Institutions", + "Derived From": "DAA, OSAC", + "Paper Title": "NADA: New Arabic Dataset for Text Classification", + "Paper Link": "https://thesai.org/Downloads/Volume9No9/Paper_28-NADA_New_Arabic_Dataset_for_Text_Classification.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "ResearchGate", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "topic classification", + "Venue Title": "IJACSA", + "Citations": "8.0", + "Venue Type": "journal", + "Venue Name": "International Journal of Advanced Computer Science and Applications", + "Authors": "Nada Alalyani, Souad Larabi Marie-Sainte", + "Affiliations": ",", + "Abstract": "In the recent years, Arabic Natural Language Processing, including Text summarization, Text simplification, Text Categorization and other Natural Language-related disciplines, are attracting more researchers. Appropriate resources for Arabic Text Categorization are becoming a big necessity for the development of this research. The few existing corpora are not ready for use, they require preprocessing and filtering operations. In addition, most of them are not organized based on standard classification methods which makes unbalanced classes and thus reduced the classification accuracy. This paper proposes a New Arabic Dataset (NADA) for Text Categorization purpose. This corpus is composed of two existing corpora OSAC and DAA. The new corpus is preprocessed and filtered using the recent state of the art methods. It is also organized based on Dewey decimal classification scheme and Synthetic Minority Over-Sampling Technique. 
The experiment results show that NADA is an efficient dataset ready for use in Arabic Text Categorization.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/nadi-2020.json b/datasets/nadi-2020.json new file mode 100644 index 0000000..ac9f4a3 --- /dev/null +++ b/datasets/nadi-2020.json @@ -0,0 +1,163 @@ +{ + "Name": "NADI-2020", + "Subsets": [ + { + "Name": "Algeria", + "Dialect": "ar-DZ: (Arabic (Algeria))", + "Volume": "2,214", + "Unit": "sentences" + }, + { + "Name": "Bahrain", + "Dialect": "ar-BH: (Arabic (Bahrain))", + "Volume": "238", + "Unit": "sentences" + }, + { + "Name": "Djibouti", + "Dialect": "ar-DJ: (Arabic (Djibouti))", + "Volume": "271", + "Unit": "sentences" + }, + { + "Name": "Egypt", + "Dialect": "ar-EG: (Arabic (Egypt))", + "Volume": "6,635", + "Unit": "sentences" + }, + { + "Name": "Iraq", + "Dialect": "ar-IQ: (Arabic (Iraq))", + "Volume": "3,816", + "Unit": "sentences" + }, + { + "Name": "Jordan", + "Dialect": "ar-JO: (Arabic (Jordan))", + "Volume": "634", + "Unit": "sentences" + }, + { + "Name": "Kuwait", + "Dialect": "ar-KW: (Arabic (Kuwait))", + "Volume": "592", + "Unit": "sentences" + }, + { + "Name": "Lebanon", + "Dialect": "ar-LB: (Arabic (Lebanon))", + "Volume": "905", + "Unit": "sentences" + }, + { + "Name": "Libya", + "Dialect": "ar-LY: (Arabic (Libya))", + "Volume": "1,600", + "Unit": "sentences" + }, + { + "Name": "Mauritania", + "Dialect": "ar-MR: (Arabic (Mauritania))", + "Volume": "255", + "Unit": "sentences" + }, + { + "Name": "Morocco", + "Dialect": "ar-MA: (Arabic (Morocco))", + "Volume": "1,579", + "Unit": "sentences" + }, + { + "Name": "Oman", + "Dialect": "ar-OM: (Arabic (Oman))", + "Volume": "1,615", + "Unit": "sentences" + }, + { + "Name": "Palestine", + "Dialect": "ar-PS: (Arabic (Palestine))", + "Volume": "624", + "Unit": "sentences" + }, + { + "Name": "Qatar ", + "Dialect": "ar-QA: (Arabic (Qatar))", + "Volume": "399", + "Unit": "sentences" + }, + { + "Name": "Saudi Arabia", + "Dialect": "ar-SA: (Arabic (Saudi Arabia))", + "Volume": "3,455", + "Unit": "sentences" + }, + { + "Name": "Somalia", + "Dialect": "ar-SO: (Arabic (Somalia))", + "Volume": "312", + "Unit": "sentences" + }, + { + "Name": "Sudan", + "Dialect": "ar-SD: (Arabic (Sudan))", + "Volume": "312", + "Unit": "sentences" + }, + { + "Name": "Syria", + "Dialect": "ar-SY: (Arabic (Syria))", + "Volume": "1,595", + "Unit": "sentences" + }, + { + "Name": "Tunisia", + "Dialect": "ar-TN: (Arabic (Tunisia))", + "Volume": "1,122", + "Unit": "sentences" + }, + { + "Name": "UAE", + "Dialect": "ar-AE: (Arabic (United Arab Emirates))", + "Volume": "1,548", + "Unit": "sentences" + }, + { + "Name": "Yemen", + "Dialect": "ar-YE: (Arabic (Yemen))", + "Volume": "1,236", + "Unit": "sentences" + } + ], + "HF Link": "nan", + "Link": "https://sites.google.com/view/nadi-shared-task", + "License": "custom", + "Year": 2020, + "Language": "ar", + "Dialect": "mixed", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "The data for the shared task covers a total of 100 provinces from 21 Arab countries and are collected from the Twitter domain", + "Volume": "30,957", + "Unit": "sentences", + "Ethical Risks": "Medium", + "Provider": "Multiple Institutions", + "Derived From": "nan", + "Paper Title": "NADI 2020: The First Nuanced Arabic Dialect Identification Shared Task\r", + "Paper Link": "https://arxiv.org/pdf/2010.11334.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Upon-Request", 
+ "Cost": "nan", + "Test Split": "Yes", + "Tasks": "dialect identification", + "Venue Title": "WANLP", + "Citations": "38.0", + "Venue Type": "workshop", + "Venue Name": "Arabic Natural Language Processing Workshop", + "Authors": "Muhammad Abdul-Mageed,Chiyu Zhang,Houda Bouamor,Nizar Habash", + "Affiliations": ",The University of British Columbia,,", + "Abstract": "We present the results and findings of the First Nuanced Arabic Dialect Identification Shared Task (NADI). The shared task includes two subtasks: country level dialect identification (Subtask 1) and province level sub-dialect identification (Subtask 2). The data for the shared task covers a total of 100 provinces from 21 Arab countries, and are collected from the Twitter domain. As such, NADI is the first shared task to target naturally-occurring fine-grained dialectal text at the sub-country level. A total of 61 teams from 25 countries registered to participate in the tasks, thus reflecting the interest of the community in this area. We received 47 submissions for Subtask 1 from 18 teams and 9 submissions to Subtask 2 from 9 teams.", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/nadi-2021.json b/datasets/nadi-2021.json new file mode 100644 index 0000000..66fbd4c --- /dev/null +++ b/datasets/nadi-2021.json @@ -0,0 +1,163 @@ +{ + "Name": "NADI-2021", + "Subsets": [ + { + "Name": "Algeria", + "Dialect": "ar-DZ: (Arabic (Algeria))", + "Volume": "2,765", + "Unit": "sentences" + }, + { + "Name": "Bahrain", + "Dialect": "ar-BH: (Arabic (Bahrain))", + "Volume": "313", + "Unit": "sentences" + }, + { + "Name": "Djibouti", + "Dialect": "ar-DJ: (Arabic (Djibouti))", + "Volume": "314", + "Unit": "sentences" + }, + { + "Name": "Egypt", + "Dialect": "ar-EG: (Arabic (Egypt))", + "Volume": "6,241", + "Unit": "sentences" + }, + { + "Name": "Iraq", + "Dialect": "ar-IQ: (Arabic (Iraq))", + "Volume": "4,042", + "Unit": "sentences" + }, + { + "Name": "Jordan", + "Dialect": "ar-JO: (Arabic (Jordan))", + "Volume": "627", + "Unit": "sentences" + }, + { + "Name": "Kuwait", + "Dialect": "ar-KW: (Arabic (Kuwait))", + "Volume": "627", + "Unit": "sentences" + }, + { + "Name": "Lebanon", + "Dialect": "ar-LB: (Arabic (Lebanon))", + "Volume": "929", + "Unit": "sentences" + }, + { + "Name": "Libya", + "Dialect": "ar-LY: (Arabic (Libya))", + "Volume": "1,883", + "Unit": "sentences" + }, + { + "Name": "Mauritania", + "Dialect": "ar-MR: (Arabic (Mauritania))", + "Volume": "314", + "Unit": "sentences" + }, + { + "Name": "Morocco", + "Dialect": "ar-MA: (Arabic (Morocco))", + "Volume": "1,256", + "Unit": "sentences" + }, + { + "Name": "Oman", + "Dialect": "ar-OM: (Arabic (Oman))", + "Volume": "2,175", + "Unit": "sentences" + }, + { + "Name": "Palestine", + "Dialect": "ar-PS: (Arabic (Palestine))", + "Volume": "626", + "Unit": "sentences" + }, + { + "Name": "Qatar ", + "Dialect": "ar-QA: (Arabic (Qatar))", + "Volume": "314", + "Unit": "sentences" + }, + { + "Name": "Saudi Arabia", + "Dialect": "ar-SA: (Arabic (Saudi Arabia))", + "Volume": "3,130", + "Unit": "sentences" + }, + { + "Name": "Somalia", + "Dialect": "ar-SO: (Arabic (Somalia))", + "Volume": "511", + "Unit": "sentences" + }, + { + "Name": "Sudan", + "Dialect": "ar-SD: (Arabic (Sudan))", + "Volume": "310", + "Unit": "sentences" + }, + { + "Name": "Syria", + "Dialect": "ar-SY: (Arabic (Syria))", + "Volume": "1,881", + "Unit": "sentences" + }, + { + "Name": "Tunisia", + "Dialect": "ar-TN: (Arabic (Tunisia))", + "Volume": "1,190", + "Unit": "sentences" + }, + { + "Name": "UAE", + 
"Dialect": "ar-AE: (Arabic (United Arab Emirates))", + "Volume": "940", + "Unit": "sentences" + }, + { + "Name": "Yemen", + "Dialect": "ar-YE: (Arabic (Yemen))", + "Volume": "612", + "Unit": "sentences" + } + ], + "HF Link": "nan", + "Link": "https://sites.google.com/view/nadi-shared-task", + "License": "CC BY-NC-ND 4.0", + "Year": 2021, + "Language": "ar", + "Dialect": "mixed", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "The shared task dataset covers a total of 100 provinces from 21 Arab countries, collected from the Twitter domain. ", + "Volume": "310,000", + "Unit": "sentences", + "Ethical Risks": "Medium", + "Provider": "Multiple Institutions", + "Derived From": "nan", + "Paper Title": "NADI 2021:\r\nThe Second Nuanced Arabic Dialect Identification Shared Task\r", + "Paper Link": "https://arxiv.org/pdf/2103.08466.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Upon-Request", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "dialect identification", + "Venue Title": "WANLP", + "Citations": "12.0", + "Venue Type": "workshop", + "Venue Name": "Arabic Natural Language Processing Workshop", + "Authors": "Muhammad Abdul-Mageed,Chiyu Zhang,AbdelRahim Elmadany,Houda Bouamor,Nizar Habash", + "Affiliations": ",The University of British Columbia,University of British Columbia,,", + "Abstract": "We present the findings and results of theSecond Nuanced Arabic Dialect IdentificationShared Task (NADI 2021). This Shared Taskincludes four subtasks: country-level ModernStandard Arabic (MSA) identification (Subtask1.1), country-level dialect identification (Subtask1.2), province-level MSA identification (Subtask2.1), and province-level sub-dialect identifica-tion (Subtask 2.2). The shared task dataset cov-ers a total of 100 provinces from 21 Arab coun-tries, collected from the Twitter domain. A totalof 53 teams from 23 countries registered to par-ticipate in the tasks, thus reflecting the interestof the community in this area. We received 16submissions for Subtask 1.1 from five teams, 27submissions for Subtask 1.2 from eight teams,12 submissions for Subtask 2.1 from four teams,and 13 Submissions for subtask 2.2 from fourteams.", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/nadia.json b/datasets/nadia.json new file mode 100644 index 0000000..8b5a8bd --- /dev/null +++ b/datasets/nadia.json @@ -0,0 +1,36 @@ +{ + "Name": "NADiA", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/NADiA", + "Link": "https://data.mendeley.com/datasets/hhrb7phdyx/1", + "License": "CC BY 4.0", + "Year": 2019, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "crawling", + "Description": "NADiA Dataset is the largest, to the best of our knowledge, source for Arabic textual data that can be used in any NLP related task such as text classification. We chose the abbreviation NADiA as it is a common Arabic name. The data was collected by scraping \u2018SkyNewsArabia\u2019 and \u2018Masrawy\u2019 news websites using Python scripts that are fine-tuned for each website. SkyNewsArabia will be referred to as NADiA1, while the latter would be NADiA2. NADiA1 is a big dataset containing 37,445 files, while NADiA2 is a huge dataset that contains 678,563 files. 
However, after filtering and cleaning we reduced the numbers to 35,416 and 451,230 for NADiA1 and NADiA2, respectively.", + "Volume": "486,646", + "Unit": "documents", + "Ethical Risks": "nan", + "Provider": "University of Sharjah", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "Mendeley Data", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "multi-label text categorization", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "Al-Debsi Ridhwan,Elnagar Ashraf,Einea Omar", + "Affiliations": "University of Sharjah", + "Abstract": "nan", + "Added By": "Kamel GAANOUN" +} \ No newline at end of file diff --git a/datasets/nafis__normalized_arabic_fragments_for_inestimable_stemming.json b/datasets/nafis__normalized_arabic_fragments_for_inestimable_stemming.json new file mode 100644 index 0000000..5e8df9f --- /dev/null +++ b/datasets/nafis__normalized_arabic_fragments_for_inestimable_stemming.json @@ -0,0 +1,36 @@ +{ + "Name": "NAFIS: Normalized Arabic Fragments for Inestimable Stemming", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.elra.info/en-us/repository/browse/ELRA-W0127/", + "License": "Non Commercial Use - ELRA END USER", + "Year": 2018, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "The corpus has the following characteristics: \u2022 37 sentences \u2022 The average length of sentences is 5.05 words, with the longest being 10 words \u2022 Declarative, interrogative, imperative and exclamatory sentences accounted for 37.84%, 32.43%, 16.22% and 13.51% respectively \u2022 154 tokens with 5.95 solutions as an average number of stemming solutions", + "Volume": "154", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": " ", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "Yes", + "Host": "ELRA", + "Access": "Upon-Request", + "Cost": "nan", + "Test Split": "No", + "Tasks": "stemming", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/naim_mhedhbi_tunisian_dialect_corpus_v0.json b/datasets/naim_mhedhbi_tunisian_dialect_corpus_v0.json new file mode 100644 index 0000000..11858ba --- /dev/null +++ b/datasets/naim_mhedhbi_tunisian_dialect_corpus_v0.json @@ -0,0 +1,36 @@ +{ + "Name": "Naim Mhedhbi Tunisian Dialect Corpus v0", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/Tunisian_Dialect_Corpus", + "Link": "https://www.kaggle.com/datasets/naim99/ts-naim-mhedhbi", + "License": "unknown", + "Year": 2021, + "Language": "ar", + "Dialect": "ar-TN: (Arabic (Tunisia))", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "More than 40,000 comments and posts collected from Facebook, of which about 10,000 comments are labeled as positive, negative, or neutral.",
Enjoy :)", + "Volume": "40,000", + "Unit": "sentences", + "Ethical Risks": "Medium", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "kaggle", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "sentiment analysis", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "Naim Mhedhbi", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Kamel GAANOUN" +} \ No newline at end of file diff --git a/datasets/named_entities_lexicon.json b/datasets/named_entities_lexicon.json new file mode 100644 index 0000000..e9c6c09 --- /dev/null +++ b/datasets/named_entities_lexicon.json @@ -0,0 +1,36 @@ +{ + "Name": "Named Entities Lexicon", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/Named_Entities_Lexicon", + "Link": "https://github.com/Hkiri-Emna/Named_Entities_Lexicon_Project", + "License": "unknown", + "Year": 2017, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "Named entities (person,organisation and location) Arabic-English Pairs Person 27480 Organization 17237 Location 4036 Overall Arabic-English Pairs 48753", + "Volume": "48,753", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "Latice Laboratory", + "Derived From": "nan", + "Paper Title": "Arabic-English Text Translation Leveraging Hybrid NER", + "Paper Link": "https://aclanthology.org/Y17-1019.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "named entity recognition,machine translation", + "Venue Title": "PACLIC", + "Citations": "6.0", + "Venue Type": "conference", + "Venue Name": "Pacific Asia Conference on Language, Information and Computation", + "Authors": "Emna Hkiri,S. Mallat,M. Zrigui", + "Affiliations": ",,", + "Abstract": "Named Entities (NEs) are a very important part of a sentence and their treatment is a potentially useful preprocessing step for Statistical Machine Translation (SMT). Improper translation of NE lapse the quality of the SMT output and it can hurt sentence\u2019s human readability considerably. Dropping NE often causes translation failures beyond the context, affecting both the morphosyntactic formedness of sentences and the word sense disambiguation in the source text. Due to peculiarities of the written Arabic language, the translation task is however rather challenging. In this work, we address the challenging issues of NEs treatment in the context of SMT of Arabic into English texts. We have experimented on three types of named entities which are: Proper names, Organization names and Location names. In this paper, we present integration between machine learning and rule based techniques to tackle Arabic NER problem in attempt to improve the final quality of the SMT system output. We show empirically that each aspect of our approach is important, and that their combination leads to the best results already after integration of NER into SMT. 
We show improvements in terms of BLEU scores (+4 points) and reduction of out of vocabulary words over a baseline for the News Commentary corpus.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/names_transliteration.json b/datasets/names_transliteration.json new file mode 100644 index 0000000..fc10dee --- /dev/null +++ b/datasets/names_transliteration.json @@ -0,0 +1,36 @@ +{ + "Name": "Names transliteration", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/names_transliteration", + "Link": "https://github.com/thomas-chauvet/names_transliteration", + "License": "unknown", + "Year": 2020, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "Contains names in Arabic characters and the associated names in Latin characters (English)", + "Volume": "118,047", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "nan", + "Derived From": "ANETAC dataset, Google transliteration data, NETransliteration-COLING2018", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "machine translation", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/narabizi_corpus.json b/datasets/narabizi_corpus.json new file mode 100644 index 0000000..b1ca21d --- /dev/null +++ b/datasets/narabizi_corpus.json @@ -0,0 +1,36 @@ +{ + "Name": "NArabizi corpus", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/NArabizi", + "Link": "https://github.com/SamiaTouileb/Narabizi", + "License": "unknown", + "Year": 2021, + "Language": "ar", + "Dialect": "ar-DZ: (Arabic (Algeria))", + "Domain": "other", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "Extension of the NArabizi treebank with additional annotation layers for sentiment and topic categories.", + "Volume": "1,500", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "University of Oslo", + "Derived From": "NArabizi treebank", + "Paper Title": "The interplay between language similarity and script on a novel multi-layer Algerian dialect corpus", + "Paper Link": "https://arxiv.org/pdf/2105.07400.pdf", + "Script": "Arab-Latn", + "Tokenized": "Yes", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "part of speech tagging, dependency parsing, machine translation, sentiment analysis, transliteration, topic classification", + "Venue Title": "FINDINGS", + "Citations": "1.0", + "Venue Type": "conference", + "Venue Name": "Findings of the Association for Computational Linguistics", + "Authors": "Samia Touileb,Jeremy Barnes", + "Affiliations": ",University of the Basque Country UPV/EHU", + "Abstract": "Recent years have seen a rise in interest for cross-lingual transfer between languages with similar typology, and between languages of various scripts. However, the interplay between language similarity and difference in script on cross-lingual transfer is a less studied problem. We explore this interplay on cross-lingual transfer for two supervised tasks, namely part-of-speech tagging and sentiment analysis.
We introduce a newly annotated corpus of Algerian user-generated comments comprising parallel annotations of Algerian written in Latin, Arabic, and code-switched scripts, as well as annotations for sentiment and topic categories. We perform baseline experiments by fine-tuning multi-lingual language models. We further explore the effect of script vs. language similarity in cross-lingual transfer by fine-tuning multi-lingual models on languages which are a) typologically distinct, but use the same script, b) typologically similar, but use a distinct script, or c) are typologically similar and use the same script. We find there is a delicate relationship between script and typology for part-of-speech, while sentiment analysis is less sensitive.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/narabizi_treebank.json b/datasets/narabizi_treebank.json new file mode 100644 index 0000000..418c31f --- /dev/null +++ b/datasets/narabizi_treebank.json @@ -0,0 +1,36 @@ +{ + "Name": "NArabizi treebank", + "Subsets": [], + "HF Link": "nan", + "Link": "https://parsiti.github.io/NArabizi/", + "License": "unknown", + "Year": 2020, + "Language": "ar", + "Dialect": "ar-DZ: (Arabic (Algeria))", + "Domain": "other", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "Fully annotated in morpho-syntax and Universal Dependency syntax, with full translation at both the word and the sentence levels", + "Volume": "1,500", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "Inria,Sorbonne Universit\u00e9", + "Derived From": "nan", + "Paper Title": "Building a User-Generated Content North-African Arabizi Treebank: Tackling Hell", + "Paper Link": "https://aclanthology.org/2020.acl-main.107.pdf", + "Script": "Arab-Latn", + "Tokenized": "Yes", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "part of speech tagging,dependency parsing,machine translation", + "Venue Title": "ACL", + "Citations": "14.0", + "Venue Type": "conference", + "Venue Name": "Association for Computational Linguistics", + "Authors": "Djam\u00e9 Seddah,Farah Essaidi,Amal Fethi,Matthieu Futeral,Benjamin M\u00fcller,Pedro Javier Ortiz Su\u00e1rez,B. Sagot,Abhishek Srivastava", + "Affiliations": "Inria;Sorbonne Universit\u00e9,,,,,Inria;Sorbonne Universit\u00e9,,", + "Abstract": "We introduce the first treebank for a romanized user-generated content variety of Algerian, a North-African Arabic dialect known for its frequent usage of code-switching. Made of 1500 sentences, fully annotated in morpho-syntax and Universal Dependency syntax, with full translation at both the word and the sentence levels, this treebank is made freely available. It is supplemented with 50k unlabeled sentences collected from Common Crawl and web-crawled data using intensive data-mining techniques. Preliminary experiments demonstrate its usefulness for POS tagging and dependency parsing. We believe that what we present in this paper is useful beyond the low-resource language community.
This is the first time that enough unlabeled and annotated data is provided for an emerging user-generated content dialectal language with rich morphology and code switching, making it a challenging test-bed for most recent NLP approaches.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/ne3l__named_entities_arabic_corpus.json b/datasets/ne3l__named_entities_arabic_corpus.json new file mode 100644 index 0000000..7d72118 --- /dev/null +++ b/datasets/ne3l__named_entities_arabic_corpus.json @@ -0,0 +1,36 @@ +{ + "Name": "NE3L: named entities Arabic corpus", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalogue.elra.info/en-us/repository/browse/ELRA-W0078/", + "License": "Non Commercial Use - ELRA END USER", + "Year": 2014, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "The NE3L project (Named Entities 3 Languages) consisted of annotating corpora in several languages with named entities. Text format data were extracted from newspapers and deal with various topics. 3 different languages were annotated: Arabic, Chinese and Russian.", + "Volume": "103,363", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": " ", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "ELRA", + "Access": "With-Fee", + "Cost": "5,000.00\u20ac", + "Test Split": "No", + "Tasks": "named entity recognition", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/negation_and_speculation_in_arabic_review_(nsar).json b/datasets/negation_and_speculation_in_arabic_review_(nsar).json new file mode 100644 index 0000000..1106e97 --- /dev/null +++ b/datasets/negation_and_speculation_in_arabic_review_(nsar).json @@ -0,0 +1,36 @@ +{ + "Name": "Negation and Speculation in Arabic Review (NSAR)", + "Subsets": [], + "HF Link": "nan", + "Link": "https://github.com/amahany/NSAR", + "License": "unknown", + "Year": 2022, + "Language": "ar", + "Dialect": "ar-EG: (Arabic (Egypt))", + "Domain": "reviews", + "Form": "text", + "Collection Style": "manual curation", + "Description": "The Negation and Speculation Arabic Review (NSAR) corpus consists of 3K randomly selected review sentences from three well-known and benchmarked Arabic corpora: Large Scale Arabic Book Review (LABR), Large Arabic Multi-domain Resources (LAMR), and Multi-domain Arabic Sentiment Corpus (MASC). It contains reviews from different categories, including books, hotels, restaurants, and other products written in various Arabic dialects.
The negation and speculation keywords have been annotated along with their linguistic scope based on the annotation guidelines reviewed by an expert linguist.", + "Volume": "3,011", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "Ain Shams University", + "Derived From": "Large Scale Arabic Book Review (LABR), Large Arabic Multi-domain Resources (LAMR), and Multi-domain Arabic Sentiment Corpus (MASC)", + "Paper Title": "Annotated Corpus with Negation and Speculation in Arabic Review Domain: NSAR", + "Paper Link": "https://github.com/amahany/NSAR/blob/main/NSAR_Paper.pdf", + "Script": "Arab-Latn", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "sentiment analysis, information retrieval, review classification, negation and speculation detection", + "Venue Title": "IJACSA", + "Citations": "nan", + "Venue Type": "journal", + "Venue Name": "International Journal of Advanced Computer Science and Applications", + "Authors": "Ahmed Mahany, Heba Khaled, Nouh Sabri Elmitwally, Naif Aljohani, Said Ghoniemy", + "Affiliations": "nan", + "Abstract": "Negation and speculation detection are critical for Natural Language Processing (NLP) tasks, such as sentiment analysis, information retrieval, and machine translation. This paper presents the first Arabic corpus in the review domain annotated with negation and speculation. The Negation and Speculation Arabic Review (NSAR) corpus consists of 3K randomly selected review sentences from three well-known and benchmarked Arabic corpora. It contains reviews from different categories, including books, hotels, restaurants, and other products written in various Arabic dialects. The negation and speculation keywords have been annotated along with their linguistic scope based on the annotation guidelines reviewed by an expert linguist. The inter-annotator agreement between two independent annotators, Arabic native speakers, is measured using the Cohen\u2019s Kappa coefficients with values of 95 and 80 for negation and speculation, respectively. Furthermore, 29% of this corpus includes at least one negation instance, while only 4% of this corpus contains speculative content. Therefore, the Arabic reviews focus more on negation structures rather than speculation. This corpus will be available for the Arabic research community to handle these critical phenomena.", + "Added By": "Ahmed Mahany" +} \ No newline at end of file diff --git a/datasets/nemlar__broadcast_news_speech_corpus.json b/datasets/nemlar__broadcast_news_speech_corpus.json new file mode 100644 index 0000000..262c366 --- /dev/null +++ b/datasets/nemlar__broadcast_news_speech_corpus.json @@ -0,0 +1,36 @@ +{ + "Name": "NEMLAR: Broadcast News Speech Corpus", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalogue.elra.info/en-us/repository/browse/ELRA-S0219/", + "License": "Non Commercial Use - ELRA END USER", + "Year": 2006, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "transcribed audio", + "Form": "spoken", + "Collection Style": "other", + "Description": "The NEMLAR Broadcast News Speech Corpus consists of about 40 hours of Standard Arabic news broadcasts. The broadcasts were recorded from four different radio stations: Medi1, Radio Orient, RMC \u2013 Radio Monte Carlo, RTM \u2013 Radio Television Maroc. Each broadcast contains between 25 and 30 minutes of news and interviews (259 distinct speakers identified).
The recordings were carried out at three different periods between 30 June 2002 and 18 July 2005. All files were recorded in linear PCM format, 16 kHz, 16 bit.", + "Volume": "40", + "Unit": "hours", + "Ethical Risks": "Low", + "Provider": " ", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "ELRA", + "Access": "With-Fee", + "Cost": "300.00\u20ac", + "Test Split": "No", + "Tasks": "speech recognition", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/nemlar__speech_synthesis_corpus.json b/datasets/nemlar__speech_synthesis_corpus.json new file mode 100644 index 0000000..95c1f8a --- /dev/null +++ b/datasets/nemlar__speech_synthesis_corpus.json @@ -0,0 +1,36 @@ +{ + "Name": "NEMLAR: Speech Synthesis Corpus", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalogue.elra.info/en-us/repository/browse/ELRA-S0220/", + "License": "Non Commercial Use - ELRA END USER", + "Year": 2006, + "Language": "ar", + "Dialect": "ar-EG: (Arabic (Egypt))", + "Domain": "transcribed audio", + "Form": "spoken", + "Collection Style": "other", + "Description": "The NEMLAR Speech Synthesis Corpus contains the recordings of 2 native Egyptian Arabic speakers (male and female, 35 and 27 years old respectively) recorded in a studio over 2 channels (voice + laryngograph). The recordings comprise more than 10 hours of data with transcriptions.", + "Volume": "10", + "Unit": "hours", + "Ethical Risks": "Low", + "Provider": " ", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "ELRA", + "Access": "With-Fee", + "Cost": "1,000.00\u20ac", + "Test Split": "No", + "Tasks": "speech recognition", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/nemlar__written_corpus.json b/datasets/nemlar__written_corpus.json new file mode 100644 index 0000000..62aba84 --- /dev/null +++ b/datasets/nemlar__written_corpus.json @@ -0,0 +1,36 @@ +{ + "Name": "NEMLAR: Written Corpus", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalogue.elra.info/en-us/repository/browse/ELRA-W0042/", + "License": "Non Commercial Use - ELRA END USER", + "Year": 2006, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "The NEMLAR Written Corpus consists of about 500,000 words of Arabic text from 13 different categories, aiming to achieve a well-balanced corpus that offers a representation of the variety in syntactic, semantic and pragmatic features of modern Arabic language.", + "Volume": "500,000", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": " ", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "ELRA", + "Access": "With-Fee", + "Cost": "300.00\u20ac", + "Test Split": "No", + "Tasks": "lexicon analysis, part of speech tagging", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline
at end of file diff --git a/datasets/nemlar_written_corpus.json b/datasets/nemlar_written_corpus.json new file mode 100644 index 0000000..a871883 --- /dev/null +++ b/datasets/nemlar_written_corpus.json @@ -0,0 +1,36 @@ +{ + "Name": "NEMLAR Written Corpus", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalogue.elra.info/en-us/repository/browse/ELRA-W0042/", + "License": "Non Commercial Use - ELRA END USER", + "Year": 2006, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "The NEMLAR Written Corpus consists of about 500,000 words of Arabic text from 13 different categories, aiming to achieve a well-balanced corpus that offers a representation of the variety in syntactic, semantic and pragmatic features of modern Arabic language.", + "Volume": "500,000", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "ELRA", + "Access": "With-Fee", + "Cost": "300.00\u20ac", + "Test Split": "No", + "Tasks": "lexicon analysis, part of speech tagging", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/netdc_arabic_bnsc__broadcast_news_speech_corpus.json b/datasets/netdc_arabic_bnsc__broadcast_news_speech_corpus.json new file mode 100644 index 0000000..c5be0c8 --- /dev/null +++ b/datasets/netdc_arabic_bnsc__broadcast_news_speech_corpus.json @@ -0,0 +1,36 @@ +{ + "Name": "NetDC Arabic BNSC: Broadcast News Speech Corpus", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalogue.elra.info/en-us/repository/browse/ELRA-S0157/", + "License": "Non Commercial Use - ELRA END USER", + "Year": 2007, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "transcribed audio", + "Form": "spoken", + "Collection Style": "other", + "Description": "The database contains ca.
22.5 hours of broadcast news speech recorded from Radio Orient (France) during a 3-month period between November 2001 and January 2002 (37 news broadcasts, including 32 from the 5.55 pm news and 5 from the 10.55 pm news, with about 90 distinct speakers identified).", + "Volume": "22.5", + "Unit": "hours", + "Ethical Risks": "Low", + "Provider": "ELDA", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "ELRA", + "Access": "With-Fee", + "Cost": "200.00\u20ac", + "Test Split": "No", + "Tasks": "speech recognition", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/netransliteration.json b/datasets/netransliteration.json new file mode 100644 index 0000000..e1de108 --- /dev/null +++ b/datasets/netransliteration.json @@ -0,0 +1,36 @@ +{ + "Name": "NETransliteration", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/NETransliteration", + "Link": "https://github.com/steveash/NETransliteration-COLING2018", + "License": "MIT License", + "Year": 2018, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "wikipedia", + "Form": "text", + "Collection Style": "crawling", + "Description": "Data files mined from Wikidata", + "Volume": "145,186", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "Amazon", + "Derived From": "nan", + "Paper Title": "Design Challenges in Named Entity Transliteration", + "Paper Link": "https://aclanthology.org/C18-1053.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "transliteration, named entity recognition", + "Venue Title": "COLING", + "Citations": "nan", + "Venue Type": "conference", + "Venue Name": "International Conference on Computational Linguistics", + "Authors": "Yuval Merhav, Stephen Ash", + "Affiliations": "Amazon Alexa AI; Amazon AWS AI", + "Abstract": "We analyze some of the fundamental design challenges that impact the development of a multilingual state-of-the-art named entity transliteration system, including curating bilingual named entity datasets and evaluation of multiple transliteration methods. We empirically evaluate the transliteration task using the traditional weighted finite state transducer (WFST) approach against two neural approaches: the encoder-decoder recurrent neural network method and the recent, non-sequential Transformer method. In order to improve availability of bilingual named entity transliteration datasets, we release personal name bilingual dictionaries mined from Wikidata for English to Russian, Hebrew, Arabic, and Japanese Katakana.
Our code and dictionaries are publicly available.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/news_commentary.json b/datasets/news_commentary.json new file mode 100644 index 0000000..cc484aa --- /dev/null +++ b/datasets/news_commentary.json @@ -0,0 +1,36 @@ +{ + "Name": "News Commentary", + "Subsets": [], + "HF Link": "https://hf.co/datasets/Helsinki-NLP/news_commentary", + "Link": "https://opus.nlpl.eu/News-Commentary.php", + "License": "unknown", + "Year": 2012, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "other", + "Description": "A parallel corpus of News Commentaries provided by WMT for training SMT", + "Volume": "200,000", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "OPUS", + "Derived From": "WMT 19", + "Paper Title": "Parallel Data, Tools and Interfaces in OPUS", + "Paper Link": "http://www.lrec-conf.org/proceedings/lrec2012/pdf/463_Paper.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "machine translation", + "Venue Title": "LREC", + "Citations": "nan", + "Venue Type": "conference", + "Venue Name": "Language Resources and Evaluation Conference", + "Authors": "Jorg Tiedemann", + "Affiliations": "Department of Linguistics and Philology Uppsala University, Uppsala/Sweden", + "Abstract": "This paper presents the current status of OPUS, a growing language resource of parallel corpora and related tools. The focus in OPUS is to provide freely available data sets in various formats together with basic annotation to be useful for applications in computational linguistics, translation studies and cross-linguistic corpus studies.
In this paper, we report about new data sets and their features, additional annotation tools and models provided from the website and essential interfaces and on-line services included in the project.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/newstent.json b/datasets/newstent.json new file mode 100644 index 0000000..e1d6d06 --- /dev/null +++ b/datasets/newstent.json @@ -0,0 +1,36 @@ +{ + "Name": "NewsTent", + "Subsets": [], + "HF Link": "nan", + "Link": "https://drive.google.com/drive/folders/1lVzaSmJWu63YFrJTszQxMqfVoAYuz28F", + "License": "unknown", + "Year": 2021, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "crawling", + "Description": "NewsTent extracts 8,443,484 articles and their summaries from 22 newspapers of 19 Arab countries dated from 1999 to 2019", + "Volume": "8,443,484", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "Multiple Institutions", + "Derived From": "nan", + "Paper Title": "NewsTent: Arabic Text Summarization Dataset of 8 Million Arabic News Articles with Summaries", + "Paper Link": "https://openreview.net/pdf?id=Sbf9j9WcAkk", + "Script": "Arab", + "Tokenized": "No", + "Host": "Gdrive", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "summarization", + "Venue Title": "other", + "Citations": "0.0", + "Venue Type": "preprint", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "There has been intensive research about text summarization in many languages. Some languages have reached advanced stages, but Arabic Text Summarization (ATS) is still in its developing stages due to several limitations starting from datasets and ending at the evaluation process. Existing ATS datasets usually contain a range of 100 to 30,000 training samples which make them too small to train a summarization model. Further, existing datasets lack diversity; they rely on one source (e.g. newspaper) or a few to collect the summaries, also some rely on a single Arab country and ignore the rest. We present NewsTent, the largest and most diverse dataset for Arabic Text Summarization tasks. NewsTent overcomes the limitations of existing datasets. First, eight million articles and their summaries are collected by the \"inverted pyramid\" writing style. The summaries are verified by human experts and achieved 87.4% accuracy from 1000 samples. Second, NewsTent collected news articles from popular newspapers sources of 19 Arab countries with an eclectic mix of topics varying from finance, politics, sports, etc. to establish comprehensive diversity.
Then, we corroborate the superiority of the extracted summaries by comparing them to MT5-generated summaries. NewsTent\u2019s large and diverse corpus overcomes the limitations of ATS datasets to enhance the ATS field.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/nileulex.json b/datasets/nileulex.json new file mode 100644 index 0000000..7be21a4 --- /dev/null +++ b/datasets/nileulex.json @@ -0,0 +1,36 @@ +{ + "Name": "NileULex", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/NileULex", + "Link": "https://github.com/NileTMRG/NileULex", + "License": "custom", + "Year": 2016, + "Language": "ar", + "Dialect": "mixed", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "Egyptian Arabic and Modern Standard Arabic sentiment words and their polarity", + "Volume": "5,953", + "Unit": "sentences", + "Ethical Risks": "Medium", + "Provider": "Nile University", + "Derived From": "nan", + "Paper Title": "NileULex: A Phrase and Word Level Sentiment Lexicon for Egyptian and Modern Standard Arabic", + "Paper Link": "https://aclanthology.org/L16-1463.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "sentiment analysis", + "Venue Title": "LREC", + "Citations": "39.0", + "Venue Type": "conference", + "Venue Name": "International Conference on Language Resources and Evaluation", + "Authors": "S. El-Beltagy", + "Affiliations": "nan", + "Abstract": "This paper presents NileULex, which is an Arabic sentiment lexicon containing close to six thousands Arabic words and compound phrases. Forty five percent of the terms and expressions in the lexicon are Egyptian or colloquial while fifty five percent are Modern Standard Arabic. While the collection of many of the terms included in the lexicon was done automatically, the actual addition of any term was done manually. One of the important criterions for adding terms to the lexicon, was that they be as unambiguous as possible. The result is a lexicon with a much higher quality than any translated variant or automatically constructed one. To demonstrate that a lexicon such as this can directly impact the task of sentiment analysis, a very basic machine learning based sentiment analyser that uses unigrams, bigrams, and lexicon based features was applied on two different Twitter datasets. The obtained results were compared to a baseline system that only uses unigrams and bigrams. The same lexicon based features were also generated using a publicly available translation of a popular sentiment lexicon.
The experiments show that usage of the developed lexicon improves the results over both the baseline and the publicly available lexicon.", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/nist_2002_open_machine_translation_(openmt)_evaluation.json b/datasets/nist_2002_open_machine_translation_(openmt)_evaluation.json new file mode 100644 index 0000000..fd7ccfd --- /dev/null +++ b/datasets/nist_2002_open_machine_translation_(openmt)_evaluation.json @@ -0,0 +1,36 @@ +{ + "Name": "NIST 2002 Open Machine Translation (OpenMT) Evaluation", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2010T10", + "License": "LDC User Agreement for Non-Members", + "Year": 2010, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "other", + "Description": "This evaluation kit includes a single perl script (mteval-v09.pl) that may be used to produce a translation quality score for one (or more) MT systems. The script works by comparing the system output translation with a set of (expert) reference translations of the same source text. Comparison is based on finding sequences of words in the reference translations that match word sequences in the system output translation. More information on the evaluation algorithm may be obtained from the paper detailing the algorithm: BLEU: a Method for Automatic Evaluation of Machine Translation (Papineni et al, 2002).", + "Volume": "nan", + "Unit": "nan", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "150.00 $", + "Test Split": "No", + "Tasks": "machine translation", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/nist_2003_open_machine_translation_(openmt)_evaluation.json b/datasets/nist_2003_open_machine_translation_(openmt)_evaluation.json new file mode 100644 index 0000000..96371c6 --- /dev/null +++ b/datasets/nist_2003_open_machine_translation_(openmt)_evaluation.json @@ -0,0 +1,36 @@ +{ + "Name": "NIST 2003 Open Machine Translation (OpenMT) Evaluation", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2010T11", + "License": "LDC User Agreement for Non-Members", + "Year": 2010, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "other", + "Description": "This evaluation kit includes a single perl script (mteval-v09c.pl) that may be used to produce a translation quality score for one (or more) MT systems. The script works by comparing the system output translation with a set of (expert) reference translations of the same source text. Comparison is based on finding sequences of words in the reference translations that match word sequences in the system output translation. 
More information on the evaluation algorithm may be obtained from the paper detailing the algorithm: BLEU: a Method for Automatic Evaluation of Machine Translation (Papineni et al, 2002).", + "Volume": "nan", + "Unit": "nan", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "150.00 $", + "Test Split": "No", + "Tasks": "machine translation", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/nist_2004_open_machine_translation_(openmt)_evaluation.json b/datasets/nist_2004_open_machine_translation_(openmt)_evaluation.json new file mode 100644 index 0000000..6883e85 --- /dev/null +++ b/datasets/nist_2004_open_machine_translation_(openmt)_evaluation.json @@ -0,0 +1,36 @@ +{ + "Name": "NIST 2004 Open Machine Translation (OpenMT) Evaluation", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2010T12", + "License": "LDC User Agreement for Non-Members", + "Year": 2010, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "other", + "Description": "This evaluation kit includes a single Perl script (mteval-v11a.pl) that may be used to produce a translation quality score for one (or more) MT systems. The script works by comparing the system output translation with a set of (expert) reference translations of the same source text. Comparison is based on finding sequences of words in the reference translations that match word sequences in the system output translation. More information on the evaluation algorithm may be obtained from the paper detailing the algorithm: BLEU: a Method for Automatic Evaluation of Machine Translation (Papineni et al, 2002).", + "Volume": "150", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "150.00 $", + "Test Split": "No", + "Tasks": "machine translation", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/nist_2005_open_machine_translation_(openmt)_evaluation.json b/datasets/nist_2005_open_machine_translation_(openmt)_evaluation.json new file mode 100644 index 0000000..25ec2f2 --- /dev/null +++ b/datasets/nist_2005_open_machine_translation_(openmt)_evaluation.json @@ -0,0 +1,36 @@ +{ + "Name": "NIST 2005 Open Machine Translation (OpenMT) Evaluation", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2010T14", + "License": "LDC User Agreement for Non-Members", + "Year": 2010, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "other", + "Description": "This evaluation kit includes a single Perl script (mteval-v11b.pl) that may be used to produce a translation quality score for one (or more) MT systems. 
The script works by comparing the system output translation with a set of (expert) reference translations of the same source text. Comparison is based on finding sequences of words in the reference translations that match word sequences in the system output translation. More information on the evaluation algorithm may be obtained from the paper detailing the algorithm: BLEU: a Method for Automatic Evaluation of Machine Translation (Papineni et al, 2002).", + "Volume": "100", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "150.00 $", + "Test Split": "No", + "Tasks": "machine translation", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/nist_2006_open_machine_translation_(openmt)_evaluation.json b/datasets/nist_2006_open_machine_translation_(openmt)_evaluation.json new file mode 100644 index 0000000..4fcdb78 --- /dev/null +++ b/datasets/nist_2006_open_machine_translation_(openmt)_evaluation.json @@ -0,0 +1,36 @@ +{ + "Name": "NIST 2006 Open Machine Translation (OpenMT) Evaluation", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2010T17", + "License": "LDC User Agreement for Non-Members", + "Year": 2010, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "other", + "Description": "This evaluation kit includes a single Perl script (mteval-v11b.pl) that may be used to produce a translation quality score for one (or more) MT systems. The script works by comparing the system output translation with a set of (expert) reference translations of the same source text. Comparison is based on finding sequences of words in the reference translations that match word sequences in the system output translation. 
More information on the evaluation algorithm may be obtained from the paper detailing the algorithm: BLEU: a Method for Automatic Evaluation of Machine Translation (Papineni et al, 2002).", + "Volume": "357", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "150.00 $", + "Test Split": "No", + "Tasks": "machine translation", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/nist_2008-2012_open_machine_translation_(openmt)_progress_test_sets.json b/datasets/nist_2008-2012_open_machine_translation_(openmt)_progress_test_sets.json new file mode 100644 index 0000000..d9f0352 --- /dev/null +++ b/datasets/nist_2008-2012_open_machine_translation_(openmt)_progress_test_sets.json @@ -0,0 +1,36 @@ +{ + "Name": "NIST 2008-2012 Open Machine Translation (OpenMT) Progress Test Sets", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2013T07", + "License": "LDC User Agreement for Non-Members", + "Year": 2013, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "web pages", + "Form": "text", + "Collection Style": "other", + "Description": "This release contains 2,748 documents with corresponding source and reference files, the latter of which contains four independent human reference translations of the source data. The source data is comprised of Arabic and Chinese newswire and web data collected by LDC in 2007. The table below displays statistics by source, genre, documents, segments and source tokens.", + "Volume": "2,748", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "150.00 $", + "Test Split": "No", + "Tasks": "machine translation", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/nist_2008_open_machine_translation_(openmt)_evaluation.json b/datasets/nist_2008_open_machine_translation_(openmt)_evaluation.json new file mode 100644 index 0000000..0830dc0 --- /dev/null +++ b/datasets/nist_2008_open_machine_translation_(openmt)_evaluation.json @@ -0,0 +1,36 @@ +{ + "Name": "NIST 2008 Open Machine Translation (OpenMT) Evaluation", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2010T21", + "License": "LDC User Agreement for Non-Members", + "Year": 2010, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "other", + "Description": "This evaluation kit includes a single Perl script (mteval-v11b.pl) that may be used to produce a translation quality score for one (or more) MT systems. The script works by comparing the system output translation with a set of (expert) reference translations of the same source text. Comparison is based on finding sequences of words in the reference translations that match word sequences in the system output translation. 
More information on the evaluation algorithm may be obtained from the paper detailing the algorithm: BLEU: a Method for Automatic Evaluation of Machine Translation (Papineni et al, 2002).", + "Volume": "373", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "150.00 $", + "Test Split": "No", + "Tasks": "machine translation", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/nist_2009_open_machine_translation_(openmt)_evaluation.json b/datasets/nist_2009_open_machine_translation_(openmt)_evaluation.json new file mode 100644 index 0000000..63aba51 --- /dev/null +++ b/datasets/nist_2009_open_machine_translation_(openmt)_evaluation.json @@ -0,0 +1,36 @@ +{ + "Name": "NIST 2009 Open Machine Translation (OpenMT) Evaluation", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2010T23", + "License": "LDC User Agreement for Non-Members", + "Year": 2010, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "other", + "Description": "This evaluation kit includes a single Perl script (mteval-v11b.pl) that may be used to produce a translation quality score for one (or more) MT systems. The script works by comparing the system output translation with a set of (expert) reference translations of the same source text. Comparison is based on finding sequences of words in the reference translations that match word sequences in the system output translation. More information on the evaluation algorithm may be obtained from the paper detailing the algorithm: BLEU: a Method for Automatic Evaluation of Machine Translation (Papineni et al, 2002).", + "Volume": "373", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "150.00 $", + "Test Split": "No", + "Tasks": "machine translation", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/nist_2012_open_machine_translation_(openmt)_progress_test_five_language_source.json b/datasets/nist_2012_open_machine_translation_(openmt)_progress_test_five_language_source.json new file mode 100644 index 0000000..e4f3bc4 --- /dev/null +++ b/datasets/nist_2012_open_machine_translation_(openmt)_progress_test_five_language_source.json @@ -0,0 +1,36 @@ +{ + "Name": "NIST 2012 Open Machine Translation (OpenMT) Progress Test Five Language Source", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2014T02", + "License": "LDC User Agreement for Non-Members", + "Year": 2014, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "web pages", + "Form": "text", + "Collection Style": "other", + "Description": "This release consists of 20 files, four for each of the five languages, presented in XML with an included DTD. 
The four files are source and reference data from the same source data in the following two styles:", + "Volume": "4", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "150.00 $", + "Test Split": "No", + "Tasks": "machine translation", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/nist_open_mt_2008_evaluation_(mt08)_selected_references_and_system_translations.json b/datasets/nist_open_mt_2008_evaluation_(mt08)_selected_references_and_system_translations.json new file mode 100644 index 0000000..3b73829 --- /dev/null +++ b/datasets/nist_open_mt_2008_evaluation_(mt08)_selected_references_and_system_translations.json @@ -0,0 +1,36 @@ +{ + "Name": "NIST Open MT 2008 Evaluation (MT08) Selected References and System Translations", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2010T01", + "License": "LDC User Agreement for Non-Members", + "Year": 2010, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "web pages", + "Form": "text", + "Collection Style": "other", + "Description": "120 documents with 1312 segments, output from 17 machine translation systems.", + "Volume": "1,312", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "200.00 $", + "Test Split": "No", + "Tasks": "machine translation", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/nlp_dataset_for_arabic_dialects.json b/datasets/nlp_dataset_for_arabic_dialects.json new file mode 100644 index 0000000..7bc05d4 --- /dev/null +++ b/datasets/nlp_dataset_for_arabic_dialects.json @@ -0,0 +1,67 @@ +{ + "Name": "NLP dataset for Arabic dialects", + "Subsets": [ + { + "Name": "Algeria", + "Dialect": "ar-DZ: (Arabic (Algeria))", + "Volume": "13,393", + "Unit": "sentences" + }, + { + "Name": "Lebanon", + "Dialect": "ar-LB: (Arabic (Lebanon))", + "Volume": "14,482", + "Unit": "sentences" + }, + { + "Name": "Morocco", + "Dialect": "ar-MA: (Arabic (Morocco))", + "Volume": "9,965", + "Unit": "sentences" + }, + { + "Name": "Tunisia", + "Dialect": "ar-TN: (Arabic (Tunisia))", + "Volume": "8,044", + "Unit": "sentences" + }, + { + "Name": "Egypt", + "Dialect": "ar-EG: (Arabic (Egypt))", + "Volume": "7,519", + "Unit": "sentences" + } + ], + "HF Link": "nan", + "Link": "https://msda.um6p.ma/msda_datasets", + "License": "unknown", + "Year": 2021, + "Language": "ar", + "Dialect": "mixed", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "This data was collected from the Twitter social network and consists of +50K tweets in five (5) national dialects", + "Volume": "52,210", + "Unit": "sentences", + "Ethical Risks": "Medium", + "Provider": "Multiple Institutions", + "Derived From": "nan", + "Paper Title": "An open access NLP dataset for Arabic dialects: data collection, labeling, and model construction",
collection, labeling, and model construction", + "Paper Link": "https://arxiv.org/pdf/2102.11000.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "dialect identification, topic classification, sentiment analysis", + "Venue Title": "ArXiv", + "Citations": "0.0", + "Venue Type": "preprint", + "Venue Name": "ArXiv", + "Authors": "ElMehdi Boujou,Hamza Chataoui,Abdellah El Mekki,Saad Benjelloun,I. Chairi,Ismail Berrada", + "Affiliations": ",,Mohammed VI Polytechnic University,,,", + "Abstract": "Natural Language Processing (NLP) is today a very active field of research and innovation. Many applications need however big sets of data for supervised learning, suitably labelled for the training purpose. This includes applications for the Arabic language and its national dialects. However, such open access labeled data sets in Arabic and its dialects are lacking in the Data Science ecosystem and this lack can be a burden to innovation and research in this field. In this work, we present an open data set of social data content in several Arabic dialects. This data was collected from the Twitter social network and consists on +50K twits in five (5) national dialects. Furthermore, this data was labeled for several applications, namely dialect detection, topic detection and sentiment analysis. We publish this data as an open access data to encourage innovation and encourage other works in the field of NLP for Arabic dialects and social media. A selection of models were built using this data set and are presented in this paper along with their performances.", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/nsurl-2019_shared_task_8.json b/datasets/nsurl-2019_shared_task_8.json new file mode 100644 index 0000000..bbbd511 --- /dev/null +++ b/datasets/nsurl-2019_shared_task_8.json @@ -0,0 +1,36 @@ +{ + "Name": "NSURL-2019 Shared Task 8", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/nsurl_2019_task8_train,https://hf.co/datasets/arbml/nsurl_2019_task8_test", + "Link": "https://ai.mawdoo3.com/nsurl-2019-task8", + "License": "CC BY-NC-SA 4.0", + "Year": 2019, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "manual curation", + "Description": "This dataset is composed of 12000 question pairs labelled with 1 for semantically similar questions and 0 for semantically different", + "Volume": "15,712", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "Mawdoo3", + "Derived From": "nan", + "Paper Title": "NSURL-2019 Shared Task 8: Semantic Question Similarity in Arabic\r", + "Paper Link": "https://aclanthology.org/2019.nsurl-1.1.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "semantic similarity", + "Venue Title": "ArXiv", + "Citations": "2.0", + "Venue Type": "preprint", + "Venue Name": "ArXiv", + "Authors": "Haitham Seelawi,Ahmad Mustafa,H. Al-Bataineh,Wael Farhan,Hussein T. Al-Natsheh", + "Affiliations": ",,,,", + "Abstract": "Question semantic similarity (Q2Q) is a challenging task that is very useful in many NLP applications, such as detecting duplicate questions and question answering systems. In this paper, we present the results and findings of the shared task (Semantic Question Similarity in Arabic). 
The task was organized as part of the first workshop on NLP Solutions for Under Resourced Languages (NSURL 2019) The goal of the task is to predict whether two questions are semantically similar or not, even if they are phrased differently. A total of 9 teams participated in the task. The datasets created for this task are made publicly available to support further research on Arabic Q2Q.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/oasst2.json b/datasets/oasst2.json new file mode 100644 index 0000000..747d81f --- /dev/null +++ b/datasets/oasst2.json @@ -0,0 +1,36 @@ +{ + "Name": "OASST2", + "Subsets": [], + "HF Link": "https://hf.co/datasets/OpenAssistant/oasst2", + "Link": "https://hf.co/datasets/OpenAssistant/oasst2", + "License": "Apache-2.0", + "Year": 2023, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "manual curation", + "Description": "This dataset contains message trees. Each message tree has an initial prompt message as the root node, which can have multiple child messages as replies, and these child messages can have multiple replies.", + "Volume": "1,274", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "HuggingFace", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "text generation", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/oca__opinion_corpus_for_arabic.json b/datasets/oca__opinion_corpus_for_arabic.json new file mode 100644 index 0000000..fa096ef --- /dev/null +++ b/datasets/oca__opinion_corpus_for_arabic.json @@ -0,0 +1,36 @@ +{ + "Name": "OCA: Opinion corpus for Arabic", + "Subsets": [], + "HF Link": "nan", + "Link": "http://150.214.174.171:8059/investigacion/recursos/oca-corpus", + "License": "unknown", + "Year": 2011, + "Language": "ar", + "Dialect": "mixed", + "Domain": "books", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "The corpus contains 500 movie reviews collected from different web pages and blogs in Arabic, 250 of them considered as positive reviews, and the other 250 as negative opinions", + "Volume": "500", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "OCA: Opinion corpus for Arabic\n", + "Paper Link": "https://onlinelibrary.wiley.com/doi/full/10.1002/asi.21598", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Upon-Request", + "Cost": "nan", + "Test Split": "No", + "Tasks": "sentiment analysis", + "Venue Title": "JASIST", + "Citations": "242.0", + "Venue Type": "journal", + "Venue Name": "The Journal of the Association for Information Science and Technology", + "Authors": "Mohammed Rushdi-Saleh,M. T. Mart\u00edn-Valdivia,L. A. U. L\u00f3pez,Jos\u00e9 Manuel Perea Ortega", + "Affiliations": ",,,", + "Abstract": "Sentiment analysis is a challenging new task related to text mining and natural language processing. Although there are, at present, several studies related to this theme, most of these focus mainly on English texts. The resources available for opinion mining (OM) in other languages are still limited. 
In this article, we present a new Arabic corpus for the OM task that has been made available to the scientific community for research purposes. The corpus contains 500 movie reviews collected from different web pages and blogs in Arabic, 250 of them considered as positive reviews, and the other 250 as negative opinions. Furthermore, different experiments have been carried out on this corpus, using machine learning algorithms such as support vector machines and Na\u00efve Bayes. The results obtained are very promising and we are encouraged to continue this line of research. \u00a9 2011 Wiley Periodicals, Inc.", + "Added By": "Maraim Masoud" +} \ No newline at end of file diff --git a/datasets/oclar.json b/datasets/oclar.json new file mode 100644 index 0000000..3c43ce6 --- /dev/null +++ b/datasets/oclar.json @@ -0,0 +1,36 @@ +{ + "Name": "OCLAR", + "Subsets": [], + "HF Link": "https://hf.co/datasets/community-datasets/oclar", + "Link": "http://archive.ics.uci.edu/ml/datasets/Opinion+Corpus+for+Lebanese+Arabic+Reviews+%28OCLAR%29#", + "License": "unknown", + "Year": 2019, + "Language": "ar", + "Dialect": "ar-LB: (Arabic (Lebanon))", + "Domain": "reviews", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "Opinion Corpus for Lebanese Arabic Reviews", + "Volume": "3,916", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "Lebanese University", + "Derived From": "nan", + "Paper Title": "Sentiment Classifier: Logistic Regression for Arabic Services\u2019 Reviews in Lebanon", + "Paper Link": "https://ieeexplore.ieee.org/abstract/document/8716394/", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "sentiment analysis, review classification", + "Venue Title": "ICCIS", + "Citations": "8.0", + "Venue Type": "conference", + "Venue Name": "International Conference on Computer and Information Sciences", + "Authors": "Marwan Al Omari,Moustafa Al-Hajj,N. Hammami,A. Sabra", + "Affiliations": "Universit\u00e9 de Poitiers,,,", + "Abstract": "This paper proposes a logistic regression approach paired with term and inverse document frequency (TF*IDF) for Arabic sentiment classification on services\u2019 reviews in Lebanon country. Reviews are about public services, including hotels, restaurants, shops, and others. We collected manually from Google reviews and Zomato, which have reached to 3916 reviews. Experiments show three core findings: 1) The classifier is confident when used to predict positive reviews. 2) The model is biased on predicting reviews with negative sentiment. 
Finally, the low percentage of negative reviews in the corpus contributes to the diffidence of logistic regression model.", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/offenseval_2020.json b/datasets/offenseval_2020.json new file mode 100644 index 0000000..5e8b5cd --- /dev/null +++ b/datasets/offenseval_2020.json @@ -0,0 +1,36 @@ +{ + "Name": "OffensEval 2020", + "Subsets": [], + "HF Link": "https://hf.co/datasets/strombergnlp/offenseval_2020", + "Link": "https://sites.google.com/site/offensevalsharedtask/results-and-paper-submission", + "License": "CC BY 4.0", + "Year": 2019, + "Language": "multilingual", + "Dialect": "mixed", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "The Arabic dataset consists of 10,000 tweets collected in April\u2013May 2019 using the Twitter API with the language filter set to Arabic: lang:ar.", + "Volume": "10,000", + "Unit": "sentences", + "Ethical Risks": "High", + "Provider": "Multiple Institutions ", + "Derived From": "Arabic OSACT4", + "Paper Title": "SemEval-2020 Task 12: Multilingual Offensive Language Identification in Social Media (OffensEval 2020)", + "Paper Link": "https://aclanthology.org/2020.semeval-1.188.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "offensive language detection ", + "Venue Title": "SemEval", + "Citations": "nan", + "Venue Type": "conference", + "Venue Name": "Semantic Evaluation", + "Authors": "Marcos Zampieri, Preslav Nakov, Sara Rosenthal, Pepa Atanasova, Georgi Karadzhov, Hamdy Mubarak, Leon Derczynski, Zeses Pitenis, \u00c7a\u011fr\u0131 \u00c7\u00f6ltekin", + "Affiliations": "nan", + "Abstract": "We present the results and the main findings of SemEval-2020 Task 12 on Multilingual Offensive Language Identification in Social Media (OffensEval-2020). The task included three subtasks corresponding to the hierarchical taxonomy of the OLID schema from OffensEval-2019, and it was offered in five languages: Arabic, Danish, English, Greek, and Turkish. 
OffensEval-2020 was one of the most popular tasks at SemEval-2020, attracting a large number of participants across all subtasks and languages: a total of 528 teams signed up to participate in the task, 145 teams submitted official runs on the test data, and 70 teams submitted system description papers.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/omcca.json b/datasets/omcca.json new file mode 100644 index 0000000..9047da4 --- /dev/null +++ b/datasets/omcca.json @@ -0,0 +1,49 @@ +{ + "Name": "omcca", + "Subsets": [ + { + "Name": "Saudi ", + "Dialect": "ar-SA: (Arabic (Saudi Arabia))", + "Volume": "18,282", + "Unit": "sentences" + }, + { + "Name": "Jordan", + "Dialect": "ar-JO: (Arabic (Jordan))", + "Volume": "10,294", + "Unit": "sentences" + } + ], + "HF Link": "nan", + "Link": "https://github.com/AhmedObaidi/omcca", + "License": "unknown", + "Year": 2016, + "Language": "ar", + "Dialect": "mixed", + "Domain": "reviews", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "Opinion Mining Corpus for Colloquial Variety of Arabic language", + "Volume": "28,576", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "Multiple Institutions", + "Derived From": "nan", + "Paper Title": "Opinion Mining: Analysis of Comments Written\r\nin Arabic Colloquial\r", + "Paper Link": "http://www.iaeng.org/publication/WCECS2016/WCECS2016_pp470-475.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "dialect identification, sentiment analysis", + "Venue Title": "WCECS", + "Citations": "8.0", + "Venue Type": "conference", + "Venue Name": "World Congress on Engineering and Computer Science", + "Authors": "Ahmed Y. Al-Obaidi,V. Samawi", + "Affiliations": ",", + "Abstract": "In Arab nations, people used to express their\r\nopinions using colloquial dialects depending on the country to\r\nwhich they belong to. Analyzing reviews written in various\r\nArabic dialects is a challenging problem. This is because some\r\nwords could have different meanings in various dialects.\r\nFurthermore, dialects could contain words that do not belong\r\nto classical Arabic language.\u200eThis research tackles the problem\r\nof sentiment analysis of reviews and comments written in\r\ncolloquial dialects of Arabic language, at which the ability of\r\ndifferent machine learning algorithms and features are\r\nexamined in polarity determination. In this work, people's\r\nreviews (written in different dialects) are classified into positive\r\nor negative opinions. Each dialect comes with its own stopwords list. Consequently, a list of stop-words that suits\r\ndifferent dialects in addition to modern standard Arabic (MSA)\r\nis suggested. In this paper, a light stemmer that suits dialects is\r\ndeveloped. Two feature sets are utilized (bag of words (BoW),\r\nand N-gram of words) to investigate their effectiveness in\r\nsentiment analysis. Finally, Na\u00efve-Bayes, Support vector\r\nmachine (SVM), and Maximum Entropy machine learning\r\nalgorithms are applied to study their performance in opinion\r\nmining. F1-measure is used to evaluate the performance of\r\nthese machine learning algorithms. To train and test the\r\nsuggested system performance, we built a corpus1\r\n of reviews by\r\ncollecting reviews written in two dialects (Saudi dialect and\r\nJordanian dialect). 
The testing results show that Maximum\r\nEntropy outperforms the other two machine learning\r\nalgorithms. Using N-gram (with N=3) as features set improves\r\nthe performance of the three machine learning algorithms. ", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/ontonotes_5_0.json b/datasets/ontonotes_5_0.json new file mode 100644 index 0000000..7ce1aa7 --- /dev/null +++ b/datasets/ontonotes_5_0.json @@ -0,0 +1,36 @@ +{ + "Name": "OntoNotes 5.0", + "Subsets": [], + "HF Link": "https://hf.co/datasets/ontonotes/conll2012_ontonotesv5", + "Link": "https://catalog.ldc.upenn.edu/LDC2013T19", + "License": "LDC User Agreement for Non-Members", + "Year": 2012, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "The Arabic portion of OntoNotes 5.0 includes 300K words of newswire data. ", + "Volume": "300,000", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "Multiple Institutions", + "Derived From": "nan", + "Paper Title": "OntoNotes Release 5.0\nwith OntoNotes DB Tool v0.999 beta ", + "Paper Link": "https://catalog.ldc.upenn.edu/docs/LDC2013T19/OntoNotes-Release-5.0.pdf", + "Script": "Arab", + "Tokenized": "Yes", + "Host": "LDC", + "Access": "Upon-Request", + "Cost": "nan", + "Test Split": "No", + "Tasks": "machine translation", + "Venue Title": "other", + "Citations": "nan", + "Venue Type": "preprint", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Mustafa Ghaleb" +} \ No newline at end of file diff --git a/datasets/ontonotes_release_3_0.json b/datasets/ontonotes_release_3_0.json new file mode 100644 index 0000000..50d0956 --- /dev/null +++ b/datasets/ontonotes_release_3_0.json @@ -0,0 +1,36 @@ +{ + "Name": "OntoNotes Release 3.0", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2009T24", + "License": "LDC User Agreement for Non-Members", + "Year": 2009, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "other", + "Description": "Each data directory has been stored as a Gnu Zipped Tar File (.tgz) due to the complexity and depth of each directory and the limitations of the ISO CD9660 file system for CD and DVD media. 
These directories may be easily unpacked using the Unix command line or using utilities such as StuffIt or WinZip under Windows.", + "Volume": "200,000", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "LDC", + "Access": "Upon-Request", + "Cost": "nan", + "Test Split": "No", + "Tasks": "information extraction,information retrieval", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/ontonotes_release_4_0.json b/datasets/ontonotes_release_4_0.json new file mode 100644 index 0000000..0c82784 --- /dev/null +++ b/datasets/ontonotes_release_4_0.json @@ -0,0 +1,36 @@ +{ + "Name": "OntoNotes Release 4.0", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2011T03", + "License": "LDC User Agreement for Non-Members", + "Year": 2011, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "other", + "Description": "Documents describing the annotation guidelines and the routines for deriving various views of the data from the database are included in the documentation directory of this release. The annotation is provided both in separate text files for each annotation layer (Treebank, PropBank, word sense, etc.) and in the form of an integrated relational database (ontonotes-v4.0.sql.gz) with a Python API to provide convenient cross-layer access.", + "Volume": "300,000", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "LDC", + "Access": "Upon-Request", + "Cost": "nan", + "Test Split": "No", + "Tasks": "information retrieval,information extraction", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/ontonotes_release_5.json b/datasets/ontonotes_release_5.json new file mode 100644 index 0000000..c69ab38 --- /dev/null +++ b/datasets/ontonotes_release_5.json @@ -0,0 +1,36 @@ +{ + "Name": "OntoNotes Release 5", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2013T19", + "License": "LDC User Agreement for Non-Members", + "Year": 2013, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "other", + "Description": "The goal of the project was to annotate a large corpus comprising various genres of text (news, conversational telephone speech, weblogs, usenet newsgroups, broadcast, talk shows) in three languages (English, Chinese, and Arabic) with structural information (syntax and predicate argument structure) and shallow semantics (word sense linked to an ontology and coreference).", + "Volume": "300,000", + "Unit": "tokens", + "Ethical Risks": "Medium", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "CoNLL-2012 Shared Task: Modeling Multilingual Unrestricted Coreference in OntoNotes", + "Paper Link": "https://aclanthology.org/W12-4501.pdf", + "Script": "Arab", + "Tokenized": 
"No", + "Host": "LDC", + "Access": "Upon-Request", + "Cost": "nan", + "Test Split": "No", + "Tasks": "coreference resolution, word sense disambiguation, named entity recognition", + "Venue Title": "SIGDAT", + "Citations": "nan", + "Venue Type": "workshop", + "Venue Name": "Special Interest Group on Linguistic data and corpus-based approaches to NLP", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Amr Keleg" +} \ No newline at end of file diff --git a/datasets/open-domain_response_generation_in_arabic_dialects.json b/datasets/open-domain_response_generation_in_arabic_dialects.json new file mode 100644 index 0000000..717f849 --- /dev/null +++ b/datasets/open-domain_response_generation_in_arabic_dialects.json @@ -0,0 +1,36 @@ +{ + "Name": "Open-Domain Response Generation in Arabic Dialects", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/reponse_generation", + "Link": "https://github.com/tareknaous/dialogue-arabic-dialects", + "License": "unknown", + "Year": 2023, + "Language": "ar", + "Dialect": "mixed", + "Domain": "other", + "Form": "text", + "Collection Style": "manual curation", + "Description": "1K open-domain dialectal utterance-response pairs, manually translated from an English dataset (DailyDialogue), and adapted to three different Arabic dialects.", + "Volume": "1,000", + "Unit": "sentences", + "Ethical Risks": "nan", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "Open-Domain Response Generation in Low-Resource Settings using Self-Supervised Pre-Training of Warm-Started Transformers", + "Paper Link": "https://dl.acm.org/doi/full/10.1145/3579164", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "machine translation, dialect identification, text generation, dialogue systems", + "Venue Title": "ArabicNLP", + "Citations": "nan", + "Venue Type": "conference", + "Venue Name": "Arabic Natural Language Processing Conference", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "Learning response generation models constitute the main component of building open-domain dialogue systems. However, training open-domain response generation models requires large amounts of labeled data and pre-trained language generation models that are often nonexistent for low-resource languages. In this article, we propose a framework for training open-domain response generation models in low-resource settings. We consider Dialectal Arabic (DA) as a working example. The framework starts by warm-starting a transformer-based encoder-decoder with pre-trained language model parameters. Next, the resultant encoder-decoder model is adapted to DA by employing self-supervised pre-training on large-scale unlabeled data in the desired dialect. Finally, the model is fine-tuned on a very small labeled dataset for open-domain response generation. The results show significant performance improvements on three spoken Arabic dialects after adopting the framework\u2019s three stages, highlighted by higher BLEU and lower Perplexity scores compared with multiple baseline models. Specifically, our models are capable of generating fluent responses in multiple dialects with an average human-evaluated fluency score above 4. 
Our data is made publicly available.", + "Added By": "Amr Keleg" +} \ No newline at end of file diff --git a/datasets/openiti-proc.json b/datasets/openiti-proc.json new file mode 100644 index 0000000..7633c4c --- /dev/null +++ b/datasets/openiti-proc.json @@ -0,0 +1,36 @@ +{ + "Name": "OpenITI-proc", + "Subsets": [], + "HF Link": "nan", + "Link": "https://zenodo.org/record/2535593#.YWh7FS8RozU", + "License": "CC BY 4.0", + "Year": 2019, + "Language": "ar", + "Dialect": "ar-CLS: (Arabic (Classic))", + "Domain": "other", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "A linguistically annotated version of the OpenITI corpus, with annotations for lemmas, POS tags, parse trees, and morphological segmentation", + "Volume": "1,500,000,000", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "Multiple Institutions ", + "Derived From": "OpenITI", + "Paper Title": "Studying the History of the Arabic Language: Language Technology and a Large-Scale Historical Corpus", + "Paper Link": "https://arxiv.org/pdf/1809.03891.pdf", + "Script": "Arab-Latn", + "Tokenized": "Yes", + "Host": "zenodo", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "text generation, language modeling", + "Venue Title": "LRE", + "Citations": "nan", + "Venue Type": "journal", + "Venue Name": "Language Resources and Evaluation", + "Authors": "Yonatan Belinkov, Alexander Magidow, Alberto Barr\u00f3n-Cede\u00f1o, Avi Shmidman, Maxim Romanov", + "Affiliations": "nan", + "Abstract": "Arabic is a widely-spoken language with a long and rich history, but existing corpora and language technology focus mostly on modern Arabic and its varieties. Therefore, studying the history of the language has so far been mostly limited to manual analyses on a small scale. In this work, we present a large-scale historical corpus of the written Arabic language, spanning 1400 years. We describe our efforts to clean and process this corpus using Arabic NLP tools, including the identification of reused text. We study the history of the Arabic language using a novel automatic periodization algorithm, as well as other techniques. 
Our findings confirm the established division of written Arabic into Modern Standard and Classical Arabic, and confirm other established periodizations, while suggesting that written Arabic may be divisible into still further periods of development.", + "Added By": "Yonatan Belinkov" +} \ No newline at end of file diff --git a/datasets/opensubtitles.json b/datasets/opensubtitles.json new file mode 100644 index 0000000..0166a66 --- /dev/null +++ b/datasets/opensubtitles.json @@ -0,0 +1,36 @@ +{ + "Name": "OpenSubtitles", + "Subsets": [], + "HF Link": "https://hf.co/datasets/Helsinki-NLP/open_subtitles", + "Link": "https://opus.nlpl.eu/OpenSubtitles.php", + "License": "unknown", + "Year": 2016, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "crawling", + "Description": "This is a new collection of translated movie subtitles from http://www.opensubtitles.org/.", + "Volume": "83,600,000", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "OPUS", + "Derived From": "nan", + "Paper Title": "OpenSubtitles2016: Extracting Large Parallel Corpora from Movie and TV Subtitles", + "Paper Link": "http://www.lrec-conf.org/proceedings/lrec2016/pdf/947_Paper.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "machine translation", + "Venue Title": "LREC", + "Citations": "nan", + "Venue Type": "conference", + "Venue Name": "Language Resources Evaluation Conference", + "Authors": "Pierre Lison, Jorg Tiedemann", + "Affiliations": "University of Oslo", + "Abstract": "We present a new major release of the OpenSubtitles collection of parallel corpora. The release is compiled from a large database\nof movie and TV subtitles and includes a total of 1689 bitexts spanning 2.6 billion sentences across 60 languages. The release also\nincorporates a number of enhancements in the preprocessing and alignment of the subtitles, such as the automatic correction of OCR\nerrors and the use of meta-data to estimate the quality of each subtitle and score subtitle pairs.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/opus100.json b/datasets/opus100.json new file mode 100644 index 0000000..08e532c --- /dev/null +++ b/datasets/opus100.json @@ -0,0 +1,36 @@ +{ + "Name": "opus100", + "Subsets": [], + "HF Link": "https://hf.co/datasets/Helsinki-NLP/opus-100", + "Link": "https://data.statmt.org/opus-100-corpus/v1.0/", + "License": "unknown", + "Year": 2020, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "crawling", + "Description": "OPUS-100 contains approximately 55M sentence pairs. 
Of the 99 language pairs, 44 have 1M sentence pairs of training data, 73 have at least 100k, and 95 have at least 10k.", + "Volume": "1,040,000", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "University of Edinburgh", + "Derived From": "nan", + "Paper Title": "Improving Massively Multilingual Neural Machine Translation and Zero-Shot Translation", + "Paper Link": "https://arxiv.org/pdf/2004.11867.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "machine translation ", + "Venue Title": "arXiv", + "Citations": "nan", + "Venue Type": "preprint", + "Venue Name": "nan", + "Authors": "Biao Zhang, Philip Williams, Ivan Titov, Rico Sennrich", + "Affiliations": "School of Informatics, University of Edinburgh;School of Informatics, University of EdinburghSchool of Informatics, University of Edinburgh; Department of Computational Linguistics, University of Zurich", + "Abstract": "Massively multilingual models for neural machine translation (NMT) are theoretically attractive, but often underperform bilingual models and deliver poor zero-shot translations.\nIn this paper, we explore ways to improve\nthem. We argue that multilingual NMT requires stronger modeling capacity to support\nlanguage pairs with varying typological characteristics, and overcome this bottleneck via\nlanguage-specific components and deepening\nNMT architectures. We identify the off-target\ntranslation issue (i.e. translating into a wrong\ntarget language) as the major source of the\ninferior zero-shot performance, and propose\nrandom online backtranslation to enforce the\ntranslation of unseen training language pairs.\nExperiments on OPUS-100 (a novel multilingual dataset with 100 languages) show that\nour approach substantially narrows the performance gap with bilingual models in both oneto-many and many-to-many settings, and improves zero-shot performance by \u223c10 BLEU,\napproaching conventional pivot-based methods", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/opus_ubuntu.json b/datasets/opus_ubuntu.json new file mode 100644 index 0000000..09074b5 --- /dev/null +++ b/datasets/opus_ubuntu.json @@ -0,0 +1,36 @@ +{ + "Name": "opus_ubuntu", + "Subsets": [], + "HF Link": "https://hf.co/datasets/Helsinki-NLP/opus_ubuntu", + "Link": "https://hf.co/datasets/opus_ubuntu", + "License": "BSD", + "Year": 2012, + "Language": "multilingual", + "Dialect": "ar-CLS: (Arabic (Classic))", + "Domain": "other", + "Form": "text", + "Collection Style": "crawling and annotation(translation)", + "Description": "These are translations of the Ubuntu software package messages, donated by the Ubuntu community.", + "Volume": "299", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "OPUS", + "Derived From": "nan", + "Paper Title": "Parallel Data, Tools and Interfaces in OPUS", + "Paper Link": "http://www.lrec-conf.org/proceedings/lrec2012/pdf/463_Paper.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "HuggingFace", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "machine translation", + "Venue Title": "LREC", + "Citations": "1203.0", + "Venue Type": "conference", + "Venue Name": "International Conference on Language Resources and Evaluation", + "Authors": "Jorg Tiedemann", + "Affiliations": "Uppsala University", + "Abstract": "This paper presents the current status of OPUS, a growing language resource of parallel corpora and related tools. 
The focus in OPUS is to provide freely available data sets in various formats together with basic annotation to be useful for applications in computational linguistics, translation studies and cross-linguistic corpus studies. In this paper, we report about new data sets and their features, additional annotation tools and models provided from the website and essential interfaces and on-line services included in the project.", + "Added By": "Khalid N. Elmadani" +} \ No newline at end of file diff --git a/datasets/opus_wikipedia.json b/datasets/opus_wikipedia.json new file mode 100644 index 0000000..d8d569a --- /dev/null +++ b/datasets/opus_wikipedia.json @@ -0,0 +1,36 @@ +{ + "Name": "OPUS Wikipedia", + "Subsets": [], + "HF Link": "https://hf.co/datasets/Helsinki-NLP/opus_wikipedia", + "Link": "https://opus.nlpl.eu/Wikipedia.php", + "License": "unknown", + "Year": 2012, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "wikipedia", + "Form": "text", + "Collection Style": "crawling", + "Description": "This is a corpus of parallel sentences extracted from Wikipedia", + "Volume": "1,000,000", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "OPUS", + "Derived From": "nan", + "Paper Title": "Parallel Data, Tools and Interfaces in OPUS", + "Paper Link": "http://www.lrec-conf.org/proceedings/lrec2012/pdf/463_Paper.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "machine translation", + "Venue Title": "LREC", + "Citations": "nan", + "Venue Type": "conference", + "Venue Name": "Language Resources Evaluation Conference", + "Authors": "Jorg Tiedemann", + "Affiliations": "Uppsala University", + "Abstract": "This paper presents the current status of OPUS, a growing language resource of parallel corpora and related tools. The focus in OPUS\nis to provide freely available data sets in various formats together with basic annotation to be useful for applications in computational\nlinguistics, translation studies and cross-linguistic corpus studies. 
In this paper, we report about new data sets and their features,\nadditional annotation tools and models provided from the website and essential interfaces and on-line services included in the project.\n", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/orientel_egypt_mca__modern_colloquial_arabic_database.json b/datasets/orientel_egypt_mca__modern_colloquial_arabic_database.json new file mode 100644 index 0000000..7bea67c --- /dev/null +++ b/datasets/orientel_egypt_mca__modern_colloquial_arabic_database.json @@ -0,0 +1,36 @@ +{ + "Name": "OrienTel Egypt MCA: Modern Colloquial Arabic database ", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalogue.elra.info/en-us/repository/browse/ELRA-S0221/", + "License": "Non Commercial Use - ELRA END USER", + "Year": 2006, + "Language": "ar", + "Dialect": "ar-EG: (Arabic (Egypt))", + "Domain": "transcribed audio", + "Form": "spoken", + "Collection Style": "manual curation", + "Description": "The OrienTel Egypt MCA (Modern Colloquial Arabic) database comprises 750 Egyptian speakers (398 males, 352 females) recorded over the Egyptian fixed and mobile telephone network.", + "Volume": "18,571", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "OrienTel", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "ELRA", + "Access": "With-Fee", + "Cost": "22,500.00\u20ac", + "Test Split": "No", + "Tasks": "speech recognition", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/orientel_egypt_msa_(modern_standard_arabic)_database.json b/datasets/orientel_egypt_msa_(modern_standard_arabic)_database.json new file mode 100644 index 0000000..e5eb00a --- /dev/null +++ b/datasets/orientel_egypt_msa_(modern_standard_arabic)_database.json @@ -0,0 +1,36 @@ +{ + "Name": "OrienTel Egypt MSA (Modern Standard Arabic) database", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalogue.elra.info/en-us/repository/browse/ELRA-S0222/", + "License": "Non Commercial Use - ELRA END USER", + "Year": 2006, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "transcribed audio", + "Form": "spoken", + "Collection Style": "other", + "Description": "The OrienTel Egypt MSA (Modern Standard Arabic) database comprises 500 Egyptian speakers (254 males, 246 females) recorded over the Egyptian fixed and mobile telephone network. 
", + "Volume": "24,500", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": " ", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "ELRA", + "Access": "With-Fee", + "Cost": "15,000.00\u20ac", + "Test Split": "No", + "Tasks": "speech recognition ", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/orientel_jordan_msa_(modern_standard_arabic)_database.json b/datasets/orientel_jordan_msa_(modern_standard_arabic)_database.json new file mode 100644 index 0000000..52c4e3b --- /dev/null +++ b/datasets/orientel_jordan_msa_(modern_standard_arabic)_database.json @@ -0,0 +1,36 @@ +{ + "Name": "OrienTel Jordan MSA (Modern Standard Arabic) database", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalogue.elra.info/en-us/repository/browse/ELRA-S0290/", + "License": "Non Commercial Use - ELRA END USER", + "Year": 2008, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "transcribed audio", + "Form": "spoken", + "Collection Style": "other", + "Description": "The OrienTel Jordan MSA (Modern Standard Arabic) database comprises 556 Jordanian speakers (288 males, 268 females) recorded over the Jordanian fixed and mobile telephone network.", + "Volume": "28,356", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": " ", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "ELRA", + "Access": "With-Fee", + "Cost": "15,000.00\u20ac", + "Test Split": "No", + "Tasks": "speech recognition ", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/orientel_tunisia_msa_(modern_standard_arabic)_database.json b/datasets/orientel_tunisia_msa_(modern_standard_arabic)_database.json new file mode 100644 index 0000000..db05612 --- /dev/null +++ b/datasets/orientel_tunisia_msa_(modern_standard_arabic)_database.json @@ -0,0 +1,36 @@ +{ + "Name": "OrienTel Tunisia MSA (Modern Standard Arabic) database", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalogue.elra.info/en-us/repository/browse/ELRA-S0187/", + "License": "Non Commercial Use - ELRA END USER", + "Year": 2005, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "spoken", + "Collection Style": "other", + "Description": "The OrienTel Tunisia MSA (Modern Standard Arabic) database comprises 598 Tunisian speakers (359 males, 239 females) recorded over the Tunisian fixed and mobile telephone network.", + "Volume": "31,096", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "ELRA", + "Access": "With-Fee", + "Cost": "6,000.00\u20ac", + "Test Split": "No", + "Tasks": "speech recognition ", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/osac.json b/datasets/osac.json new file mode 100644 index 
0000000..858c114 --- /dev/null +++ b/datasets/osac.json @@ -0,0 +1,36 @@ +{ + "Name": "OSAC", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/OSAC_CNN", + "Link": "https://sourceforge.net/projects/ar-text-mining/files/Arabic-Corpora/", + "License": "unknown", + "Year": 2010, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "crawling", + "Description": "collecting the largest free\r\naccessible Arabic corpus, OSAC, which contains about\r\n18M words and about 0.5M distinct keywords.", + "Volume": "22,429", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "Islamic University of Gaza", + "Derived From": "nan", + "Paper Title": "OSAC: Open Source Arabic Corpora\r", + "Paper Link": "http://site.iugaza.edu.ps/wp-content/uploads/mksaad-OSAC-OpenSourceArabicCorpora-EECS10-rev9(1).pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "sourceforge", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "topic classification", + "Venue Title": "other", + "Citations": "101.0", + "Venue Type": "preprint", + "Venue Name": "nan", + "Authors": "Motaz Saad,W. Ashour", + "Affiliations": "The Islamic University of Gaza,", + "Abstract": "Arabic Linguistics is promising research field. The acute lack of free public accessible Arabic corpora is one of the major difficulties that Arabic linguistics researches face. The effort of this paper is a step towards supporting Arabic linguistics research field. This paper presents the complex nature of Arabic language, pose the problems of: (1) lacking free public Arabic corpora, (2) the lack of high-quality, wellstructured Arabic digital contents. The paper finally presents OSAC, the largest free accessible that we collected.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/oscar-2201.json b/datasets/oscar-2201.json new file mode 100644 index 0000000..4d082ba --- /dev/null +++ b/datasets/oscar-2201.json @@ -0,0 +1,36 @@ +{ + "Name": "OSCAR-2201", + "Subsets": [], + "HF Link": "https://hf.co/datasets/oscar-corpus/OSCAR-2201", + "Link": "https://hf.co/datasets/oscar-corpus/OSCAR-2201", + "License": "CC0", + "Year": 2022, + "Language": "multilingual", + "Dialect": "mixed", + "Domain": "web pages", + "Form": "text", + "Collection Style": "crawling", + "Description": "OSCAR or Open Super-large Crawled Aggregated coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the ungoliant architecture. Data is distributed by language in both original and deduplicated form.", + "Volume": "8,718,929", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "nan", + "Derived From": "Common Crawl", + "Paper Title": "Towards a Cleaner Document-Oriented Multilingual Crawled Corpus", + "Paper Link": "https://arxiv.org/pdf/2201.06642.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "HuggingFace", + "Access": "Upon-Request", + "Cost": "nan", + "Test Split": "No", + "Tasks": "text generation, language modeling", + "Venue Title": "arXiv", + "Citations": "nan", + "Venue Type": "preprint", + "Venue Name": "nan", + "Authors": "Julien Abadji, Pedro Ortiz Suarez, Laurent Romary, Beno\u00eet Sagot\n", + "Affiliations": "nan", + "Abstract": "The need for raw large raw corpora has dramatically increased in recent years with the introduction of transfer learning and\nsemi-supervised learning methods to Natural Language Processing. 
And while there have been some recent attempts to\nmanually curate the amount of data necessary to train large language models, the main way to obtain this data is still through\nautomatic web crawling. In this paper we take the existing multilingual web corpus OSCAR and its pipeline Ungoliant that\nextracts and classifies data from Common Crawl at the line level, and propose a set of improvements and automatic annotations\nin order to produce a new document-oriented version of OSCAR that could prove more suitable to pre-train large generative\nlanguage models as well as hopefully other applications in Natural Language Processing and Digital Humanities.\n", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/oscar_small.json b/datasets/oscar_small.json new file mode 100644 index 0000000..91fb4b4 --- /dev/null +++ b/datasets/oscar_small.json @@ -0,0 +1,36 @@ +{ + "Name": "OSCAR Small", + "Subsets": [], + "HF Link": "https://hf.co/datasets/nthngdy/oscar-small", + "Link": "https://hf.co/datasets/nthngdy/oscar-small", + "License": "CC0", + "Year": 2022, + "Language": "multilingual", + "Dialect": "mixed", + "Domain": "web pages", + "Form": "text", + "Collection Style": "other", + "Description": "a processed version of and smaller subset of OSCAR", + "Volume": "408,438", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "-", + "Derived From": "OSCAR", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "HuggingFace", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "language modeling, text generation ", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/osian.json b/datasets/osian.json new file mode 100644 index 0000000..5a4ab2d --- /dev/null +++ b/datasets/osian.json @@ -0,0 +1,36 @@ +{ + "Name": "OSIAN", + "Subsets": [], + "HF Link": "nan", + "Link": "http://oujda-nlp-team.net/en/corpora/osian-corpus/", + "License": "CC BY-NC 4.0", + "Year": 2019, + "Language": "ar", + "Dialect": "mixed", + "Domain": "news articles", + "Form": "text", + "Collection Style": "crawling", + "Description": "The corpus data was collected from international Arabic news websites,", + "Volume": "3,500,000", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "Multiple Institutions", + "Derived From": "nan", + "Paper Title": "OSIAN: Open Source International Arabic News Corpus -\r\nPreparation and Integration into the CLARIN-infrastructure\r", + "Paper Link": "https://aclanthology.org/W19-4619.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "text generation, language modeling", + "Venue Title": "WANLP", + "Citations": "15.0", + "Venue Type": "workshop", + "Venue Name": "Arabic Natural Language Processing Workshop", + "Authors": "Imad Zeroual,Dirk Goldhahn,Thomas Eckart,A. Lakhouaja", + "Affiliations": ",,,", + "Abstract": "The World Wide Web has become a fundamental resource for building large text corpora. Broadcasting platforms such as news websites are rich sources of data regarding diverse topics and form a valuable foundation for research. The Arabic language is extensively utilized on the Web. Still, Arabic is relatively an under-resourced language in terms of availability of freely annotated corpora. 
This paper presents the first version of the Open Source International Arabic News (OSIAN) corpus. The corpus data was collected from international Arabic news websites, all being freely available on the Web. The corpus consists of about 3.5 million articles comprising more than 37 million sentences and roughly 1 billion tokens. It is encoded in XML; each article is annotated with metadata information. Moreover, each word is annotated with lemma and part-of-speech. the described corpus is processed, archived and published into the CLARIN infrastructure. This publication includes descriptive metadata via OAI-PMH, direct access to the plain text material (available under Creative Commons Attribution-Non-Commercial 4.0 International License - CC BY-NC 4.0), and integration into the WebLicht annotation platform and CLARIN\u2019s Federated Content Search FCS.", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/osman.json b/datasets/osman.json new file mode 100644 index 0000000..d8426ac --- /dev/null +++ b/datasets/osman.json @@ -0,0 +1,36 @@ +{ + "Name": "Osman", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/Osman_Un_Corpus", + "Link": "https://github.com/drelhaj/OsmanReadability", + "License": "unknown", + "Year": 2016, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "crawling", + "Description": "Text readability for 73,000 parallel sentences from English and Arabic UN documents", + "Volume": "73,000", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "Lancaster University", + "Derived From": "nan", + "Paper Title": "OSMAN \u2013 A Novel Arabic Readability Metric\r", + "Paper Link": "https://aclanthology.org/L16-1038.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "machine translation", + "Venue Title": "LREC", + "Citations": "14.0", + "Venue Type": "conference", + "Venue Name": "International Conference on Language Resources and Evaluation", + "Authors": "Mahmoud El-Haj,Paul Rayson", + "Affiliations": "Lancaster University,Lancaster University", + "Abstract": "We present OSMAN (Open Source Metric for Measuring Arabic Narratives) - a novel open source Arabic readability metric and tool. It allows researchers to calculate readability for Arabic text with and without diacritics. OSMAN is a modified version of the conventional readability formulas such as Flesch and Fog. In our work we introduce a novel approach towards counting short, long and stress syllables in Arabic which is essential for judging readability of Arabic narratives. We also introduce an additional factor called \u201cFaseeh\u201d which considers aspects of script usually dropped in informal Arabic writing. To evaluate our methods we used Spearman\u2019s correlation metric to compare text readability for 73,000 parallel sentences from English and Arabic UN documents. The Arabic sentences were written with the absence of diacritics and in order to count the number of syllables we added the diacritics in using an open source tool called Mishkal. 
The results show that OSMAN readability formula correlates well with the English ones making it a useful tool for researchers and educators working with Arabic text.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/osman_un_corpus.json b/datasets/osman_un_corpus.json new file mode 100644 index 0000000..db8522c --- /dev/null +++ b/datasets/osman_un_corpus.json @@ -0,0 +1,36 @@ +{ + "Name": "OSMAN UN Corpus", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/osman_readability", + "Link": "https://github.com/drelhaj/OsmanReadability", + "License": "unknown", + "Year": 2016, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "commentary", + "Form": "text", + "Collection Style": "other", + "Description": "Arabic readability", + "Volume": "73,000", + "Unit": "sentences", + "Ethical Risks": "Medium", + "Provider": "Lancaster University", + "Derived From": "nan", + "Paper Title": "OSMAN \u2013 A Novel Arabic Readability Metric", + "Paper Link": "https://aclanthology.org/L16-1038.pdf", + "Script": "Arab-Latn", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "readability assessment", + "Venue Title": "European Language Resources Association (ELRA)", + "Citations": "nan", + "Venue Type": "conference", + "Venue Name": "European Language Resources Association (ELRA)", + "Authors": "Mahmoud El-Ha, Paul Rayson", + "Affiliations": "Lancaster University", + "Abstract": "We present OSMAN (Open Source Metric for Measuring Arabic Narratives) - a novel open source Arabic readability metric and tool. It allows researchers to calculate readability for Arabic text with and without diacritics. OSMAN is a modified version of the conventional readability formulas such as Flesch and Fog. In our work we introduce a novel approach towards counting short, long and stress syllables in Arabic which is essential for judging readability of Arabic narratives. We also introduce an additional factor called \u201cFaseeh\u201d which considers aspects of script usually dropped in informal Arabic writing. To evaluate our methods we used Spearman\u2019s correlation metric to compare text readability for 73,000 parallel sentences from English and Arabic UN documents. The Arabic sentences were written with the absence of diacritics and in order to count the number of syllables we added the diacritics in using an open source tool called Mishkal. The results show that OSMAN readability formula correlates well with the English ones making it a useful tool for researchers and educators working with Arabic tex", + "Added By": "Emad A. 
Alghamdi" +} \ No newline at end of file diff --git a/datasets/paad.json b/datasets/paad.json new file mode 100644 index 0000000..d0e4648 --- /dev/null +++ b/datasets/paad.json @@ -0,0 +1,36 @@ +{ + "Name": "PAAD", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/PAAD", + "Link": "https://data.mendeley.com/datasets/spvbf5bgjs/2", + "License": "CC BY 4.0", + "Year": 2020, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "he dataset is 206 articles distributed into three categories as (Reform, Conservative and Revolutionary) that we offer to the research community on Arabiccomputational linguistics.", + "Volume": "206", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "Multiple Institutions", + "Derived From": "nan", + "Paper Title": "PAAD:POLITICAL ARABIC ARTICLES DATASET FOR AUTOMATIC TEXT CATEGORIZATION", + "Paper Link": "https://ijci.uoitc.edu.iq/index.php/ijci/article/view/246/174", + "Script": "Arab", + "Tokenized": "No", + "Host": "Mendeley Data", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "topic classification", + "Venue Title": "IJCI", + "Citations": "3.0", + "Venue Type": "journal", + "Venue Name": "Iraqi journal of computer and informatics", + "Authors": "Dhafar Hamed Abd,Ahmed T. Sadiq,Ayad R. Abbas", + "Affiliations": ",,", + "Abstract": "Now day\u2019s text Classification and Sentiment analysis is considered as one of the popular Natural Language Processing (NLP) tasks. This kind of technique plays significant role in human activities and has impact on the daily behaviours. Each article in different fields such as politics and business represent different opinions according to the writer tendency. A huge amount of data will be acquired through that differentiation. The capability to manage the political orientation of an online article automatically. Therefore, there is no corpus for political categorization was directed towards this task in Arabic, due to the lack of rich representative resources for training an Arabic text classifier. However, we introduce political Arabic articles dataset (PAAD) of textual data collected from newspapers, social network, general forum and ideology website. The dataset is 206 articles distributed into three categories as (Reform, Conservative and Revolutionary) that we offer to the research community on Arabic computational linguistics. We anticipate that this dataset would make a great aid for a variety of NLP tasks on Modern Standard Arabic, political text classification purposes. We present the data in raw form and excel file. 
Excel files of four types: V1 raw data, V2 preprocessing, V3 root stemming and V4 light stemming.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/padic.json b/datasets/padic.json new file mode 100644 index 0000000..3fcda93 --- /dev/null +++ b/datasets/padic.json @@ -0,0 +1,73 @@ +{ + "Name": "PADIC", + "Subsets": [ + { + "Name": "ALG", + "Dialect": "ar-DZ: (Arabic (Algeria))", + "Volume": "6,400", + "Unit": "sentences" + }, + { + "Name": "ANB", + "Dialect": "ar-DZ: (Arabic (Algeria))", + "Volume": "6,400", + "Unit": "sentences" + }, + { + "Name": "TUN", + "Dialect": "ar-TN: (Arabic (Tunisia))", + "Volume": "6,400", + "Unit": "sentences" + }, + { + "Name": "SYR", + "Dialect": "ar-SY: (Arabic (Syria))", + "Volume": "6,400", + "Unit": "sentences" + }, + { + "Name": "PAL", + "Dialect": "ar-PS: (Arabic (Palestine))", + "Volume": "6,400", + "Unit": "sentences" + }, + { + "Name": "MSA", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Volume": "6,400", + "Unit": "sentences" + } + ], + "HF Link": "https://hf.co/datasets/arbml/PADIC", + "Link": "https://sourceforge.net/projects/padic/", + "License": "GPL-3.0", + "Year": 2015, + "Language": "ar", + "Dialect": "mixed", + "Domain": "other", + "Form": "text", + "Collection Style": "manual curation", + "Description": "A Parallel Arabic DIalect Corpus built from scratch.", + "Volume": "32,000", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "Multiple Institutions", + "Derived From": "nan", + "Paper Title": "Machine Translation Experiments on PADIC: A Parallel Arabic DIalect Corpus", + "Paper Link": "https://aclanthology.org/Y15-1004.pdf", + "Script": "Arab-Latn", + "Tokenized": "No", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "machine translation", + "Venue Title": "PACLIC", + "Citations": "58.0", + "Venue Type": "conference", + "Venue Name": "Pacific Asia Conference on Language, Information and Computation", + "Authors": "Karima Meftouh,S. Harrat,S. Jamoussi,Mourad Abbas,Kamel Sma\u00efli", + "Affiliations": ",,,,", + "Abstract": "We present in this paper PADIC, a Parallel Arabic DIalect Corpus we built from scratch, then we conducted experiments on cross-dialect Arabic machine translation. PADIC is composed of dialects from both the Maghreb and the Middle-East. Each dialect has been aligned with Modern Standard Arabic (MSA). Three dialects from the Maghreb are concerned by this study (two from Algeria, one from Tunisia), along with two dialects from the Middle East (Syria and Palestine). PADIC has been built from scratch because of the lack of dialect resources. In fact, Arabic dialects in the Arab world in general are used in daily life conversations but they are not written. To the best of our knowledge, PADIC, up to now, is the largest corpus in the community working on dialects, especially those concerning the Maghreb. PADIC is composed of 6,400 sentences for each of the 5 concerned dialects and MSA. We conducted cross-lingual machine translation experiments between all the language pairs. For translating to MSA we interpolated the corresponding Language Model (LM) with a large Arabic corpus-based LM.
We also studied the impact of language model smoothing techniques on the results of machine translation, because this corpus, even though it is the largest one, is still very small in comparison to those used for the translation of natural languages.", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/padic__parallel_arabic_dialect_corpus.json b/datasets/padic__parallel_arabic_dialect_corpus.json new file mode 100644 index 0000000..b8ec507 --- /dev/null +++ b/datasets/padic__parallel_arabic_dialect_corpus.json @@ -0,0 +1,73 @@ +{ + "Name": "PADIC: Parallel Arabic DIalect Corpus", + "Subsets": [ + { + "Name": "MSA", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Volume": "8,244", + "Unit": "sentences" + }, + { + "Name": "ALG", + "Dialect": "ar-DZ: (Arabic (Algeria))", + "Volume": "8,244", + "Unit": "sentences" + }, + { + "Name": "ANB", + "Dialect": "ar-DZ: (Arabic (Algeria))", + "Volume": "8,244", + "Unit": "sentences" + }, + { + "Name": "TUN", + "Dialect": "ar-TN: (Arabic (Tunisia))", + "Volume": "8,244", + "Unit": "sentences" + }, + { + "Name": "PAL", + "Dialect": "ar-PS: (Arabic (Palestine))", + "Volume": "8,244", + "Unit": "sentences" + }, + { + "Name": "SYR", + "Dialect": "ar-SY: (Arabic (Syria))", + "Volume": "8,244", + "Unit": "sentences" + } + ], + "HF Link": "https://hf.co/datasets/arbml/PADIC", + "Link": "https://smart.loria.fr/corpora/", + "License": "unknown", + "Year": 2014, + "Language": "ar", + "Dialect": "mixed", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "Composed of about 6,400 sentences of dialects from both the Maghreb and the Middle East.", + "Volume": "12,824", + "Unit": "sentences", + "Ethical Risks": "Medium", + "Provider": "LORIA, France", + "Derived From": "nan", + "Paper Title": "A multidialectal parallel corpus of Arabic", + "Paper Link": "https://hal.archives-ouvertes.fr/hal-01261587/document", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "machine translation", + "Venue Title": "LREC", + "Citations": "82.0", + "Venue Type": "conference", + "Venue Name": "International Conference on Language Resources and Evaluation", + "Authors": "Houda Bouamor,Nizar Habash,Kemal Oflazer", + "Affiliations": ",,", + "Abstract": "The daily spoken variety of Arabic is often termed the colloquial or dialect form of Arabic. There are many Arabic dialects across the Arab World and within other Arabic speaking communities. These dialects vary widely from region to region and to a lesser extent from city to city in each region. The dialects are not standardized, they are not taught, and they do not have official status. However, they are the primary vehicles of communication (face-to-face and recently, online) and have a large presence in the arts as well. In this paper, we present the first multidialectal Arabic parallel corpus, a collection of 2,000 sentences in Standard Arabic, Egyptian, Tunisian, Jordanian, Palestinian and Syrian Arabic, in addition to English.
Such parallel data does not exist naturally, which makes this corpus a very valuable resource that has many potential applications such as Arabic dialect identification and machine translation.", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/pan17_author_profiling.json b/datasets/pan17_author_profiling.json new file mode 100644 index 0000000..4b46e68 --- /dev/null +++ b/datasets/pan17_author_profiling.json @@ -0,0 +1,36 @@ +{ + "Name": "PAN17 Author Profiling", + "Subsets": [], + "HF Link": "nan", + "Link": "https://zenodo.org/record/3745980#.YqTxWnZBxD9", + "License": "unknown", + "Year": 2017, + "Language": "multilingual", + "Dialect": "mixed", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "We provide you with a training data set that consists of Twitter tweets in English, Spanish, Portuguese and Arabic, labeled with gender and language variety.", + "Volume": "4,000", + "Unit": "sentences", + "Ethical Risks": "nan", + "Provider": "Bauhaus-Universit\u00e4t Weimar", + "Derived From": "nan", + "Paper Title": "Overview of PAN\u201917 Author Identification, Author Profiling, and Author Obfuscation", + "Paper Link": "https://riunet.upv.es/bitstream/handle/10251/102943/PAN-2017-author.pdf", + "Script": "Arab-Latn", + "Tokenized": "No", + "Host": "zenodo", + "Access": "Upon-Request", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "author profiling", + "Venue Title": "CLEF", + "Citations": "85.0", + "Venue Type": "conference", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Nouamane Tazi" +} \ No newline at end of file diff --git a/datasets/pan18_author_profiling.json b/datasets/pan18_author_profiling.json new file mode 100644 index 0000000..c77a8f4 --- /dev/null +++ b/datasets/pan18_author_profiling.json @@ -0,0 +1,36 @@ +{ + "Name": "PAN18 Author Profiling", + "Subsets": [], + "HF Link": "nan", + "Link": "https://zenodo.org/record/3746006#.YptKU3ZBxD8", + "License": "unknown", + "Year": 2018, + "Language": "multilingual", + "Dialect": "mixed", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "We provide you with a training data set that consists of Twitter users labeled with gender. For each author, a total of 100 tweets and 10 images are provided. 
Authors are grouped by the language of their tweets: English, Arabic and Spanish.", + "Volume": "250,000", + "Unit": "sentences", + "Ethical Risks": "nan", + "Provider": "Bauhaus-Universit\u00e4t Weimar", + "Derived From": "nan", + "Paper Title": "Overview of the 6th Author Profiling Task at PAN 2018: Multimodal Gender Identification in Twitter", + "Paper Link": "https://pan.webis.de/downloads/publications/papers/rangel_2018.pdf", + "Script": "Arab-Latn", + "Tokenized": "No", + "Host": "zenodo", + "Access": "Upon-Request", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "author profiling", + "Venue Title": "CLEF", + "Citations": "132.0", + "Venue Type": "conference", + "Venue Name": "Conference and Labs of the Evaluation Forum", + "Authors": "Francisco Rangel, Paolo Rosso, Martin Potthast, Benno Stein", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Nouamane Tazi" +} \ No newline at end of file diff --git a/datasets/pan_arabic_intrinsic_plagiarism_detection_shared_task_corpus.json b/datasets/pan_arabic_intrinsic_plagiarism_detection_shared_task_corpus.json new file mode 100644 index 0000000..676062c --- /dev/null +++ b/datasets/pan_arabic_intrinsic_plagiarism_detection_shared_task_corpus.json @@ -0,0 +1,36 @@ +{ + "Name": "PAN Arabic Intrinsic Plagiarism Detection Shared Task Corpus", + "Subsets": [], + "HF Link": "nan", + "Link": "https://zenodo.org/record/6609196#.YqTYvNrMLIV", + "License": "CC BY 4.0", + "Year": 2015, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "wikipedia", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "Each part of the corpus (training and test) consists mainly of 2 datasets: textual files and XML files. The textual files represent the suspicious documents, i.e., the documents that contain artificial plagiarism; the XML files are the plagiarism annotations, i.e., they provide for each plagiarized passage its starting offset in the suspicious document and its length (offset and length are both expressed in characters). A suspicious document file and its plagiarism annotation file share the same name.", + "Volume": "2,048", + "Unit": "documents", + "Ethical Risks": "nan", + "Provider": "MISC Lab, Constantine 2 University; PRHLT, Universitat Polit\u00e8cnica de Val\u00e8ncia", + "Derived From": "nan", + "Paper Title": "A New Corpus for the Evaluation of Arabic Intrinsic Plagiarism Detection.", + "Paper Link": "https://link.springer.com/chapter/10.1007/978-3-642-40802-1_6", + "Script": "Arab", + "Tokenized": "No", + "Host": "zenodo", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "plagiarism detection", + "Venue Title": "CLEF 2013", + "Citations": "nan", + "Venue Type": "conference", + "Venue Name": "International Conference of the Cross-Language Evaluation Forum for European Languages", + "Authors": "Imene Bensalem, Paolo Rosso, Salim Chikhi", + "Affiliations": "MISC Lab, Constantine 2 University, Algeria; PRHLT, Universitat Polit\u00e8cnica de Val\u00e8ncia, Spain", + "Abstract": "The present paper introduces the first corpus for the evaluation of Arabic intrinsic plagiarism detection.
The corpus consists of 1024 artificial suspicious documents in which 2833 plagiarism cases have been inserted automatically from source documents.", + "Added By": "Kamel GAANOUN" +} \ No newline at end of file diff --git a/datasets/papluca_language-identification.json b/datasets/papluca_language-identification.json new file mode 100644 index 0000000..d856e5f --- /dev/null +++ b/datasets/papluca_language-identification.json @@ -0,0 +1,36 @@ +{ + "Name": "papluca/language-identification", + "Subsets": [], + "HF Link": "https://hf.co/datasets/papluca/language-identification", + "Link": "https://hf.co/datasets/papluca/language-identification", + "License": "unknown", + "Year": 2021, + "Language": "multilingual", + "Dialect": "ar-CLS: (Arabic (Classic))", + "Domain": "other", + "Form": "text", + "Collection Style": "crawling", + "Description": "The Language Identification dataset is a collection of 90k samples consisting of text passages and the corresponding language labels.", + "Volume": "90,000", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "nan", + "Derived From": "Multilingual Amazon Reviews Corpus, XNLI, and STSb Multi MT", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab-Latn", + "Tokenized": "No", + "Host": "HuggingFace", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "language identification", + "Venue Title": "The Hugging Face Course Community Event", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Khalid N. Elmadani" +} \ No newline at end of file diff --git a/datasets/phonbank_arabic_kuwaiti_corpus.json b/datasets/phonbank_arabic_kuwaiti_corpus.json new file mode 100644 index 0000000..74045df --- /dev/null +++ b/datasets/phonbank_arabic_kuwaiti_corpus.json @@ -0,0 +1,36 @@ +{ + "Name": "PhonBank Arabic Kuwaiti Corpus", + "Subsets": [], + "HF Link": "nan", + "Link": "https://phon.talkbank.org/access/Other/Arabic/Kuwaiti.html", + "License": "unknown", + "Year": 2015, + "Language": "ar", + "Dialect": "ar-KW: (Arabic (Kuwait))", + "Domain": "other", + "Form": "spoken", + "Collection Style": "other", + "Description": "The data collection took place in Kuwait. A total of 70 children, in groups of ten, were sampled from the general Kuwaiti population. Residents of Kuwait originating from neighbouring countries were excluded. The subjects were randomly selected from variable ethnic backgrounds and social classes, concentrating on the inner-city population. The sample was divided into seven gender-balanced subgroups with the following age ranges: 1;4-1;7, 1;8-1;11, 2;0-2;3, 2;4-2;7, 2;8-2;11, 3;0-3;3 and 3;4-3;7. A spontaneous speech sample was audio and video recorded on a single occasion. The parent was instructed to interact spontaneously with his or her child for 30 minutes. A set of rubber toys and picture books was made available during all recording sessions. The parent/child spontaneous interaction was digitally recorded with an Edirol R-09HR Handheld SD Recorder and a Shure PG14/PG185 Lavalier wireless microphone system attached to a custom-made vest and connected wirelessly to the recorder.
The vest was made to hold the microphone transmitter on the child\u2019s back and the lavalier microphone was attached approximately 10-15 centimetres below the child\u2019s chin.", + "Volume": "nan", + "Unit": "hours", + "Ethical Risks": "nan", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "Early Phonological Acquisition by Kuwaiti Arabic Children", + "Paper Link": "https://core.ac.uk/download/pdf/153779285.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "speech recognition", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "Shaima AlQattan, Ghada Khattab", + "Affiliations": "Shaikh Salem Al-Ali Centre,Newcastle University", + "Abstract": "This is the first exploration of typical phonological development in the speech of children acquiring Kuwaiti-Arabic (KA) before the age of 4;0. In many of the world\u2019s languages, salient aspects of the ambient language have been shown to influence the child\u2019s initial progress in language acquisition (Vihman, 1996, 2014); however, studies of the phonological development of Arabic lack adequate information on the extent of the influence of factors such as the frequency of occurrence of certain features and their phonological salience on the early stages of speech acquisition. A cross-sectional study design was adopted in this thesis to explore the speech of 70 typically developing children. The children were sampled from the Arabic-speaking Kuwaiti population; the children were aged between 1;4 and 3;7 and gender-balanced. Spontaneous speech samples were obtained from audio and video recordings of the children while interacting with their parent for 30 minutes. The production accuracy of KA consonants was examined to explore the influence of type and token frequencies on the order of consonant acquisition and the development of error patterns. The sonority index was also used to predict the order of consonant acquisition cross-linguistically. The findings were then compared with those of other dialects of Arabic to identify within-language variability and with studies on English to address cross-linguistic differences between Arabic and English early phonological development. The results are partially consistent with accounts that argue for a significant role of input frequency in determining the rate and order of consonant acquisition within a language. The development of KA error patterns also shows relative sensitivity to consonant frequency. The sonority index does not always help in the prediction of all Arabic consonants, and the developmental error patterns and early word structures in Arabic and English are significantly distinct. The outcomes of this study provide essential knowledge about typical Arabic phonological development and the first step towards building a standardised phonological test for Arabic-speaking children.
", + "Added By": "Kamel GAANOUN" +} \ No newline at end of file diff --git a/datasets/phonemes_of_arabic.json b/datasets/phonemes_of_arabic.json new file mode 100644 index 0000000..e1c3b67 --- /dev/null +++ b/datasets/phonemes_of_arabic.json @@ -0,0 +1,36 @@ +{ + "Name": "Phonemes of Arabic", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2020S13", + "License": "LDC User Agreement for Non-Members", + "Year": 2020, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "transcribed audio", + "Form": "spoken", + "Collection Style": "other", + "Description": "Arabic has three short vowels, three long vowels and 28 consonants. Speakers recorded all sounds and repeated each sound three times. Each speaker also recorded 24 Arabic words with a specified consonant-vowel pattern and repeated each word three times.", + "Volume": "1", + "Unit": "hours", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "250.00 $", + "Test Split": "No", + "Tasks": "speech recognition,language identification", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/phrasebank__collins_multilingual_database_(mld).json b/datasets/phrasebank__collins_multilingual_database_(mld).json new file mode 100644 index 0000000..1ff4b4c --- /dev/null +++ b/datasets/phrasebank__collins_multilingual_database_(mld).json @@ -0,0 +1,36 @@ +{ + "Name": "PhraseBank: Collins Multilingual database (MLD)", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalogue.elra.info/en-us/repository/browse/ELRA-T0377/", + "License": "Non Commercial Use - ELRA END USER", + "Year": 2016, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "transcribed audio", + "Form": "spoken", + "Collection Style": "other", + "Description": "The PhraseBank consists of 2,000 phrases in 28 languages (Arabic, Chinese, Croatian, Czech, Danish, Dutch, American English, British English, Farsi, Finnish, French, German, Greek, Hindi, Italian, Japanese, Korean, Norwegian, Polish, Portuguese (Iberian), Portuguese (Brazilian), Russian, Spanish (Iberian), Spanish (Latin American), Swedish, Thai, Turkish, Vietnamese). Phrases are organised under 12 main topics and 67 subtopics. 
Covered topics are: talking to people, getting around, accommodation, shopping, leisure, communications, practicalities, health and beauty, eating and drinking, time.", + "Volume": "2,000", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "-", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "ELRA", + "Access": "With-Fee", + "Cost": "2,240.00\u20ac", + "Test Split": "No", + "Tasks": "speech recognition", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/polyglot-ner.json b/datasets/polyglot-ner.json new file mode 100644 index 0000000..994931c --- /dev/null +++ b/datasets/polyglot-ner.json @@ -0,0 +1,36 @@ +{ + "Name": "POLYGLOT-NER", + "Subsets": [], + "HF Link": "nan", + "Link": "https://www3.cs.stonybrook.edu/~polyglot/ner2/", + "License": "unknown", + "Year": 2014, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "wikipedia", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "Polyglot-NER: a training dataset automatically generated from Wikipedia and Freebase for the task of named entity recognition. The dataset contains the basic Wikipedia-based training data for 40 languages (with coreference resolution). The details of the procedure for generating it are outlined in Section 3 of the paper (https://arxiv.org/abs/1410.3791). Each config contains the data corresponding to a different language. For example, \"es\" includes only Spanish examples.", + "Volume": "10,000,144", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "Stony Brook University", + "Derived From": "nan", + "Paper Title": "POLYGLOT-NER: Massive Multilingual Named Entity Recognition", + "Paper Link": "https://arxiv.org/pdf/1410.3791.pdf", + "Script": "Arab-Latn", + "Tokenized": "Yes", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "named entity recognition", + "Venue Title": "arXiv", + "Citations": "161.0", + "Venue Type": "preprint", + "Venue Name": "nan", + "Authors": "Rami Al-Rfou, Vivek Kulkarni, Bryan Perozzi, Steven Skiena", + "Affiliations": "Stony Brook University", + "Abstract": "The increasing diversity of languages used on the web introduces a new level of complexity to Information Retrieval (IR) systems. We can no longer assume that textual content is written in one language or even the same language family. In this paper, we demonstrate how to build massive multilingual annotators with minimal human expertise and intervention. We describe a system that builds Named Entity Recognition (NER) annotators for 40 major languages using Wikipedia and Freebase. Our approach does not require NER human annotated datasets or language specific resources like treebanks, parallel corpora, and orthographic rules. The novelty of our approach lies in using only language-agnostic techniques, while achieving competitive performance. Our method learns distributed word representations (word embeddings) which encode semantic and syntactic features of words in each language. Then, we automatically generate datasets from Wikipedia link structure and Freebase attributes.
Finally, we apply two preprocessing stages (oversampling and exact surface form matching) which do not require any linguistic expertise. Our evaluation is twofold: First, we demonstrate the system performance on human annotated datasets. Second, for languages where no gold-standard benchmarks are available, we propose a new method, distant evaluation, based on statistical machine translation.", + "Added By": "Abdelrahman Kaseb" +} \ No newline at end of file diff --git a/datasets/prague_arabic_dependency_treebank_1_0.json b/datasets/prague_arabic_dependency_treebank_1_0.json new file mode 100644 index 0000000..b5d065c --- /dev/null +++ b/datasets/prague_arabic_dependency_treebank_1_0.json @@ -0,0 +1,36 @@ +{ + "Name": "Prague Arabic Dependency Treebank 1.0", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2004T23", + "License": "LDC User Agreement for Non-Members", + "Year": 2004, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "other", + "Description": "The corpus of PADT 1.0 consists of morphologically and analytically annotated newswire texts of Modern Standard Arabic, which originate from the Arabic Gigaword (LDC2003T12) and the plain data of Arabic Treebank: Part 1 v 2.0 (LDC2003T06) and Arabic Treebank: Part 2 v 2.0 (LDC2004T02).", + "Volume": "212,500", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "100.00 $", + "Test Split": "No", + "Tasks": "cross-lingual information retrieval,information extraction,information retrieval,language modeling,language teaching,machine translation,parsing", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/ptcc.json b/datasets/ptcc.json new file mode 100644 index 0000000..fa0bc46 --- /dev/null +++ b/datasets/ptcc.json @@ -0,0 +1,36 @@ +{ + "Name": "PTCC", + "Subsets": [], + "HF Link": "https://hf.co/datasets/AMR-KELEG/PTCC", + "Link": "https://hf.co/datasets/AMR-KELEG/PTCC", + "License": "MIT License", + "Year": 2024, + "Language": "ar", + "Dialect": "ar-TN: (Arabic (Tunisia))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "A corpus of 149 articles extracted from the 2014 Tunisian Constitution, written in Modern Standard Arabic and Tunisian Arabic.", + "Volume": "149", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "HuggingFace", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "machine translation, language modeling, dialect identification, text generation", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "Amr Keleg", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Amr Keleg" +} \ No newline at end of file diff --git a/datasets/pulpo.json b/datasets/pulpo.json new file mode 100644 index 0000000..bdd0bd3 --- /dev/null +++ b/datasets/pulpo.json @@ -0,0 +1,36 @@ +{ + "Name": "pulpo", + "Subsets": [], + "HF Link": "https://hf.co/datasets/linhd-postdata/pulpo", + "Link":
"https://hf.co/datasets/linhd-postdata/pulpo", + "License": "unknown", + "Year": 2023, + "Language": "multilingual", + "Dialect": "ar-CLS: (Arabic (Classic))", + "Domain": "web pages", + "Form": "text", + "Collection Style": "crawling", + "Description": "PULPO, the Prolific Unannotated Literary Poetry Corpus, is a set of multilingual corpora of verses and stanzas with over 95M words", + "Volume": "256,000", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "ALBERTI, a Multilingual Domain Specific Language Model for Poetry Analysis", + "Paper Link": "https://arxiv.org/pdf/2307.01387", + "Script": "Arab", + "Tokenized": "No", + "Host": "HuggingFace", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "poetry analysis", + "Venue Title": "arXiv", + "Citations": "nan", + "Venue Type": "preprint", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "The computational analysis of poetry is limited by the scarcity of tools to automatically analyze and scan poems. In a\nmultilingual settings, the problem is exacerbated as scansion and rhyme systems only exist for individual languages, making\ncomparative studies very challenging and time consuming. In this work, we present Alberti, the first multilingual pretrained large language model for poetry. Through domain-specific pre-training (DSP), we further trained multilingual BERT\non a corpus of over 12 million verses from 12 languages. We evaluated its performance on two structural poetry tasks:\nSpanish stanza type classification, and metrical pattern prediction for Spanish, English and German. In both cases, Alberti\noutperforms multilingual BERT and other tranformers-based models of similar sizes, and even achieves state-of-the-art results\nfor German when compared to rule-based systems, demonstrating the feasibility and effectiveness of DSP in the poetry\ndomain.\n", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/qa4mre.json b/datasets/qa4mre.json new file mode 100644 index 0000000..64ac73f --- /dev/null +++ b/datasets/qa4mre.json @@ -0,0 +1,36 @@ +{ + "Name": "QA4MRE", + "Subsets": [], + "HF Link": "https://hf.co/datasets/community-datasets/qa4mre", + "Link": "http://nlp.uned.es/clef-qa/repository/qa4mre.php", + "License": "unknown", + "Year": 2013, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "web pages", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "QA4MRE dataset was created for the CLEF 2011/2012/2013 shared tasks to promote research in question answering and reading comprehension. The dataset contains a supporting passage and a set of questions corresponding to the passage. Multiple options for answers are provided for each question, of which only one is correct. The training and test datasets are available for the main track. 
Additional gold standard documents are available for two pilot studies: one on Alzheimer's data, and the other on entrance exams data.", + "Volume": "160", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "QA4MRE 2011-2013: Overview of Question Answering for Machine Reading Evaluation", + "Paper Link": "https://link.springer.com/chapter/10.1007/978-3-642-40802-1_29", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "multiple choice question answering", + "Venue Title": "CLEF", + "Citations": "nan", + "Venue Type": "conference", + "Venue Name": "Conference and Labs of the Evaluation Forum", + "Authors": "Anselmo Pe\u00f1as, Eduard Hovy, Pamela Forner, \u00c1lvaro Rodrigo, Richard Sutcliffe, Roser Morante", + "Affiliations": "nan", + "Abstract": "This paper describes the methodology for testing the performance of Machine Reading systems through Question Answering and Reading Comprehension Tests. This was the attempt of the QA4MRE challenge which was run as a Lab at CLEF 2011\u20132013. The traditional QA task was replaced by a new Machine Reading task, whose intention was to ask questions that required a deep knowledge of individual short texts and in which systems were required to choose one answer, by analysing the corresponding test document in conjunction with background text collections provided by the organization. Four different tasks have been organized during these years: Main Task, Processing Modality and Negation for Machine Reading, Machine Reading of Biomedical Texts about Alzheimer\u2019s disease, and Entrance Exams. This paper describes their motivation, their goals, their methodology for preparing the data sets, their background collections, their metrics used for the evaluation, and the lessons learned along these three years.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/qabas.json b/datasets/qabas.json new file mode 100644 index 0000000..3195a0e --- /dev/null +++ b/datasets/qabas.json @@ -0,0 +1,36 @@ +{ + "Name": "Qabas", + "Subsets": [], + "HF Link": "nan", + "Link": "https://sina.birzeit.edu/qabas", + "License": "CC BY-ND 4.0", + "Year": 2024, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "manual curation", + "Description": "An open-source Arabic lexicon designed to support Natural Language Processing (NLP) applications. It uniquely integrates lemmas from 110 different lexicons, linking over 58,000 lemmas, including 45,000 nominal lemmas, 12,500 verbal lemmas, and 473 functional-word lemmas. Additionally, Qabas is linked to 12 morphologically annotated corpora, totaling around 2 million tokens, making it the first Arabic lexicon to combine both lexicons and corpora.
It was developed semi-automatically through a mapping framework and a web-based tool.", + "Volume": "60,000", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "SinaLab, Birzeit University", + "Derived From": "nan", + "Paper Title": "Qabas: An Open-Source Arabic Lexicographic Database", + "Paper Link": "https://arxiv.org/abs/2406.06598", + "Script": "Arab", + "Tokenized": "No", + "Host": "SinaLab Resources", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "language modeling, information retrieval, part of speech tagging, morphological analysis, review classification, fake news detection, irony detection", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "Mustafa Jarrar, Tymaa Hammouda", + "Affiliations": "nan", + "Abstract": "We present Qabas, a novel open-source Arabic lexicon designed for NLP applications. The novelty of Qabas lies in its synthesis of 110 lexicons. Specifically, Qabas lexical entries (lemmas) are assembled by linking lemmas from 110 lexicons. Furthermore, Qabas lemmas are also linked to 12 morphologically annotated corpora (about 2M tokens), making it the first Arabic lexicon to be linked to lexicons and corpora. Qabas was developed semi-automatically, utilizing a mapping framework and a web-based tool. Compared with other lexicons, Qabas stands as the most extensive Arabic lexicon, encompassing about 58K lemmas (45K nominal lemmas, 12.5K verbal lemmas, and 473 functional-word lemmas). Qabas is open-source and accessible online at https://sina.birzeit.edu/qabas.", + "Added By": "Tymaa Hammouda" +} \ No newline at end of file diff --git a/datasets/qac__qatari_arabic_corpus.json b/datasets/qac__qatari_arabic_corpus.json new file mode 100644 index 0000000..7213b80 --- /dev/null +++ b/datasets/qac__qatari_arabic_corpus.json @@ -0,0 +1,36 @@ +{ + "Name": "QAC: Qatari Arabic Corpus", + "Subsets": [], + "HF Link": "nan", + "Link": "http://www.isle.illinois.edu/dialect/QAC/index.html", + "License": "CC BY 4.0", + "Year": 2014, + "Language": "ar", + "Dialect": "ar-QA: (Arabic (Qatar))", + "Domain": "transcribed audio", + "Form": "spoken", + "Collection Style": "crawling", + "Description": "Speech recorded from four Qatari television programs in 2009-2011.", + "Volume": "18.45", + "Unit": "hours", + "Ethical Risks": "Low", + "Provider": "Qatar University", + "Derived From": "nan", + "Paper Title": "Development of a TV Broadcasts Speech Recognition System for Qatari Arabic", + "Paper Link": "http://www.lrec-conf.org/proceedings/lrec2014/pdf/430_Paper.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "speech recognition", + "Venue Title": "LREC", + "Citations": "nan", + "Venue Type": "conference", + "Venue Name": "Language Resources and Evaluation Conference", + "Authors": "Mohamed Elmahdy, Mark Hasegawa-Johnson, Eiman Mustafawi", + "Affiliations": "Qatar University; University of Illinois; Qatar University", + "Abstract": "A major problem with dialectal Arabic speech recognition is the sparsity of speech resources. In this paper, a transfer learning framework is proposed to jointly use a large amount of Modern Standard Arabic (MSA) data and a small amount of dialectal Arabic data to improve acoustic and language modeling. The Qatari Arabic (QA) dialect has been chosen as a typical example of an under-resourced Arabic dialect.
A wide-band speech corpus has been collected and transcribed from several Qatari TV series and talk-show programs. A large vocabulary speech recognition baseline system was built using the QA corpus. The proposed MSA-based transfer learning technique was performed by applying orthographic normalization, phone mapping, data pooling, acoustic model adaptation, and system combination. The proposed approach can achieve more than 28% relative reduction in WER.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/qadi_arabic.json b/datasets/qadi_arabic.json new file mode 100644 index 0000000..326da0a --- /dev/null +++ b/datasets/qadi_arabic.json @@ -0,0 +1,145 @@ +{ + "Name": "QADI Arabic", + "Subsets": [ + { + "Name": "AE", + "Dialect": "ar-AE: (Arabic (United Arab Emirates))", + "Volume": "28,011", + "Unit": "sentences" + }, + { + "Name": "BH", + "Dialect": "ar-BH: (Arabic (Bahrain))", + "Volume": "28,479", + "Unit": "sentences" + }, + { + "Name": "DZ", + "Dialect": "ar-DZ: (Arabic (Algeria))", + "Volume": "17,773", + "Unit": "sentences" + }, + { + "Name": "EG", + "Dialect": "ar-EG: (Arabic (Egypt))", + "Volume": "67,983", + "Unit": "sentences" + }, + { + "Name": "IQ", + "Dialect": "ar-IQ: (Arabic (Iraq))", + "Volume": "18,545", + "Unit": "sentences" + }, + { + "Name": "JO", + "Dialect": "ar-JO: (Arabic (Jordan))", + "Volume": "34,289", + "Unit": "sentences" + }, + { + "Name": "KW", + "Dialect": "ar-KW: (Arabic (Kuwait))", + "Volume": "50,153", + "Unit": "sentences" + }, + { + "Name": "LB", + "Dialect": "ar-LB: (Arabic (Lebanon))", + "Volume": "38,580", + "Unit": "sentences" + }, + { + "Name": "LY", + "Dialect": "ar-LY: (Arabic (Libya))", + "Volume": "41,052", + "Unit": "sentences" + }, + { + "Name": "MA", + "Dialect": "ar-MA: (Arabic (Morocco))", + "Volume": "12,991", + "Unit": "sentences" + }, + { + "Name": "OM", + "Dialect": "ar-OM: (Arabic (Oman))", + "Volume": "24,955", + "Unit": "sentences" + }, + { + "Name": "PL", + "Dialect": "ar-PS: (Arabic (Palestine))", + "Volume": "48,814", + "Unit": "sentences" + }, + { + "Name": "QA", + "Dialect": "ar-QA: (Arabic (Qatar))", + "Volume": "36,873", + "Unit": "sentences" + }, + { + "Name": "SA", + "Dialect": "ar-SA: (Arabic (Saudi Arabia))", + "Volume": "35,595", + "Unit": "sentences" + }, + { + "Name": "SD", + "Dialect": "ar-SD: (Arabic (Sudan))", + "Volume": "16,439", + "Unit": "sentences" + }, + { + "Name": "SY", + "Dialect": "ar-SY: (Arabic (Syria))", + "Volume": "18,511", + "Unit": "sentences" + }, + { + "Name": "TN", + "Dialect": "ar-TN: (Arabic (Tunisia))", + "Volume": "13,094", + "Unit": "sentences" + }, + { + "Name": "YE", + "Dialect": "ar-YE: (Arabic (Yemen))", + "Volume": "11,756", + "Unit": "sentences" + } + ], + "HF Link": "https://hf.co/datasets/arbml/QADI", + "Link": "https://alt.qcri.org/resources/qadi", + "License": "Apache-2.0", + "Year": 2020, + "Language": "ar", + "Dialect": "mixed", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "A dataset of dialectal Arabic tweets covering 18 Arab countries, automatically collected from Twitter.", + "Volume": "540,590", + "Unit": "sentences", + "Ethical Risks": "Medium", + "Provider": "QCRI", + "Derived From": "nan", + "Paper Title": "Arabic Dialect Identification in the Wild", + "Paper Link": "https://arxiv.org/pdf/2005.06557.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "QCRI Resources", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "dialect identification", + "Venue Title": "ArXiv", + "Citations": "16.0", + "Venue Type": "preprint",
+ "Venue Name": "ArXiv", + "Authors": "Ahmed Abdelali,Hamdy Mubarak,Younes Samih,Sabit Hassan,Kareem Darwish", + "Affiliations": ",,University Of D\u00fcsseldorf;Computational Linguistics,,", + "Abstract": "We present QADI, an automatically collected dataset of tweets belonging to a wide range of country-level Arabic dialects -covering 18 different countries in the Middle East and North Africa region. Our method for building this dataset relies on applying multiple filters to identify users who belong to different countries based on their account descriptions and to eliminate tweets that are either written in Modern Standard Arabic or contain inappropriate language. The resultant dataset contains 540k tweets from 2,525 users who are evenly distributed across 18 Arab countries. Using intrinsic evaluation, we show that the labels of a set of randomly selected tweets are 91.5% accurate. For extrinsic evaluation, we are able to build effective country-level dialect identification on tweets with a macro-averaged F1-score of 60.6% across 18 classes.", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/qasr.json b/datasets/qasr.json new file mode 100644 index 0000000..fcc4552 --- /dev/null +++ b/datasets/qasr.json @@ -0,0 +1,36 @@ +{ + "Name": "QASR", + "Subsets": [], + "HF Link": "nan", + "Link": "https://arabicspeech.org/qasr/", + "License": "unknown", + "Year": 2021, + "Language": "ar", + "Dialect": "mixed", + "Domain": "transcribed audio", + "Form": "spoken", + "Collection Style": "crawling and annotation(other)", + "Description": "This multi-dialect speech dataset contains 2, 000 hours of speech sampled at 16kHz crawled from Aljazeera news channel", + "Volume": "2,000", + "Unit": "hours", + "Ethical Risks": "Low", + "Provider": "QCRI", + "Derived From": "nan", + "Paper Title": "QASR: QCRI Aljazeera Speech Resource\r\nA Large Scale Annotated Arabic Speech Corpus", + "Paper Link": "https://arxiv.org/pdf/2106.13000.pdf", + "Script": "Arab", + "Tokenized": "Yes", + "Host": "other", + "Access": "Upon-Request", + "Cost": "nan", + "Test Split": "No", + "Tasks": "speech recognition", + "Venue Title": "ACL", + "Citations": "2.0", + "Venue Type": "conference", + "Venue Name": "Assofications of computation linguisitcs", + "Authors": "Hamdy Mubarak,Amir Hussein,S. A. Chowdhury", + "Affiliations": ",,", + "Abstract": "We introduce the largest transcribed Arabic speech corpus, QASR1, collected from the broadcast domain. This multi-dialect speech dataset contains 2, 000 hours of speech sampled at 16kHz crawled from Aljazeera news channel. The dataset is released with lightly supervised transcriptions, aligned with the audio segments. Unlike previous datasets, QASR contains linguistically motivated segmentation, punctuation, speaker information among others. QASR is suitable for training and evaluating speech recognition systems, acousticsand/or linguisticsbased Arabic dialect identification, punctuation restoration, speaker identification, speaker linking, and potentially other NLP modules for spoken data. In addition to QASR transcription, we release a dataset of 130M words to aid in designing and training a better language model. We show that end-to-end automatic speech recognition trained on QASR reports a competitive word error rate compared to the previous MGB-2 corpus. We report baseline results for downstream natural language processing tasks such as named entity recognition using speech transcript. We also report the first baseline for Arabic punctuation restoration. 
We make the corpus available for the research community.", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/qatari_heritage_corpus.json b/datasets/qatari_heritage_corpus.json new file mode 100644 index 0000000..19ac5f3 --- /dev/null +++ b/datasets/qatari_heritage_corpus.json @@ -0,0 +1,36 @@ +{ + "Name": "Qatari heritage corpus", + "Subsets": [], + "HF Link": "nan", + "Link": "https://data.world/saraalmulla/qatari-heritage-expressions", + "License": "CDLA-Permissive-1.0", + "Year": 2020, + "Language": "ar", + "Dialect": "ar-QA: (Arabic (Qatar))", + "Domain": "other", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "Qatari heritage expressions dataset with translations.", + "Volume": "1,000", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "Hamad Bin Khalifa University", + "Derived From": "nan", + "Paper Title": "Building a Corpus of Qatari Arabic Expressions", + "Paper Link": "https://aclanthology.org/2020.osact-1.4.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "data.world", + "Access": "Upon-Request", + "Cost": "nan", + "Test Split": "No", + "Tasks": "machine translation", + "Venue Title": "OSACT", + "Citations": "1.0", + "Venue Type": "workshop", + "Venue Name": "Workshop on Open-Source Arabic Corpora and Processing Tools", + "Authors": "Sara Al-Mulla,W. Zaghouani", + "Affiliations": ",", + "Abstract": "The current Arabic natural language processing resources are mainly built to address Modern Standard Arabic (MSA), while we have witnessed some scattered efforts to build resources for various Arabic dialects such as the Levantine and the Egyptian dialects. We observed a lack of resources for Gulf Arabic and especially the Qatari variety. In this paper, we present the first Qatari idioms and expressions corpus of 1,000 entries. The corpus was created from on-line and printed sources in addition to transcribed recorded interviews. The corpus covers various Qatari traditional expressions and idioms. To this end, audio recordings were collected from interviews and an online survey questionnaire was conducted to validate our data.
This corpus aims to help advance the dialectal Arabic Speech and Natural Language Processing tools and applications for the Qatari dialect.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/qcri_parallel_tweets.json b/datasets/qcri_parallel_tweets.json new file mode 100644 index 0000000..004ad01 --- /dev/null +++ b/datasets/qcri_parallel_tweets.json @@ -0,0 +1,36 @@ +{ + "Name": "QCRI Parallel Tweets", + "Subsets": [], + "HF Link": "https://hf.co/datasets/alt-qsri/tweets_ar_en_parallel", + "Link": "https://alt.qcri.org/resources/bilingual_corpus_of_parallel_tweets", + "License": "Apache-2.0", + "Year": 2020, + "Language": "multilingual", + "Dialect": "mixed", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling", + "Description": "A bilingual corpus of Arabic-English parallel tweets and a list of Twitter accounts that post Arabic-English tweets.", + "Volume": "166,000", + "Unit": "sentences", + "Ethical Risks": "Medium", + "Provider": "QCRI", + "Derived From": "nan", + "Paper Title": "Constructing a Bilingual Corpus of Parallel Tweets", + "Paper Link": "https://aclanthology.org/2020.bucc-1.3.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "QCRI Resources", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "machine translation", + "Venue Title": "BUCC", + "Citations": "3.0", + "Venue Type": "workshop", + "Venue Name": "Workshop on Building and Using Comparable Corpora", + "Authors": "Hamdy Mubarak,Sabit Hassan,Ahmed Abdelali", + "Affiliations": ",,", + "Abstract": "In a bid to reach a larger and more diverse audience, Twitter users often post parallel tweets\u2014tweets that contain the same content but are written in different languages. Parallel tweets can be an important resource for developing machine translation (MT) systems among other natural language processing (NLP) tasks. In this paper, we introduce a generic method for collecting parallel tweets. Using this method, we collect a bilingual corpus of English-Arabic parallel tweets and a list of Twitter accounts who post English-Arabic tweets regularly. Since our method is generic, it can also be used for collecting parallel tweets that cover less-resourced languages such as Serbian and Urdu. Additionally, we annotate a subset of Twitter accounts with their countries of origin and topic of interest, which provides insights about the population who post parallel tweets. This latter information can also be useful for author profiling tasks.", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/quran_hadith_datasets.json b/datasets/quran_hadith_datasets.json new file mode 100644 index 0000000..42d15a5 --- /dev/null +++ b/datasets/quran_hadith_datasets.json @@ -0,0 +1,36 @@ +{ + "Name": "Quran Hadith Datasets", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/Quran_Hadith", + "Link": "https://github.com/ShathaTm/Quran_Hadith_Datasets", + "License": "unknown", + "Year": 2022, + "Language": "multilingual", + "Dialect": "ar-CLS: (Arabic (Classic))", + "Domain": "other", + "Form": "text", + "Collection Style": "crawling and annotation(translation)", + "Description": "The datasets showcase the related and non-related pairs of Quran-Quran and Quran-Hadith.
It has Classical Arabic and English translated verses and teachings.", + "Volume": "20,360", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "Challenging the Transformer-based models with a Classical Arabic dataset: Quran and Hadith", + "Paper Link": "nan", + "Script": "Arab-Latn", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "semantic similarity", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "conference", + "Venue Name": "Language Resources and Evaluation Conference", + "Authors": "Shatha Altammami, Eric Atwell", + "Affiliations": "University of Leeds/King Saud University", + "Abstract": "nan", + "Added By": "Abdullah Alsaleh" +} \ No newline at end of file diff --git a/datasets/quran_speech__imam_+_users.json b/datasets/quran_speech__imam_+_users.json new file mode 100644 index 0000000..e92a527 --- /dev/null +++ b/datasets/quran_speech__imam_+_users.json @@ -0,0 +1,36 @@ +{ + "Name": "Quran Speech: Imam + Users", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/quran_uthmani", + "Link": "https://github.com/tarekeldeeb/DeepSpeech-Quran/tree/master/data/quran", + "License": "CC BY 4.0", + "Year": 2019, + "Language": "ar", + "Dialect": "ar-CLS: (Arabic (Classic))", + "Domain": "transcribed audio", + "Form": "spoken", + "Collection Style": "crawling and annotation(other)", + "Description": "7 full Quran recitations + 18K filtered user recitations", + "Volume": "61,000", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "quran.ksu.edu.sa + tarteel.io", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "speech recognition", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Tarek Eldeeb" +} \ No newline at end of file diff --git a/datasets/quranic_arabic_corpus.json b/datasets/quranic_arabic_corpus.json new file mode 100644 index 0000000..f9d8e03 --- /dev/null +++ b/datasets/quranic_arabic_corpus.json @@ -0,0 +1,36 @@ +{ + "Name": "Quranic Arabic Corpus", + "Subsets": [], + "HF Link": "nan", + "Link": "https://corpus.quran.com/download/", + "License": "custom", + "Year": 2017, + "Language": "ar", + "Dialect": "ar-CLS: (Arabic (Classic))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "Morphological annotation of the Quranic corpus.", + "Volume": "128,218", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Latn", + "Tokenized": "Yes", + "Host": "other", + "Access": "Upon-Request", + "Cost": "nan", + "Test Split": "No", + "Tasks": "morphological analysis", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/rats_keyword_spotting.json b/datasets/rats_keyword_spotting.json new file mode 100644 index 0000000..1540107 --- /dev/null +++ b/datasets/rats_keyword_spotting.json @@ -0,0 +1,36 @@ +{ + "Name": "RATS Keyword Spotting", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2017S20", + "License": "LDC User Agreement for
Non-Members", + "Year": 2017, + "Language": "multilingual", + "Dialect": "ar-LEV: (Arabic (Levant))", + "Domain": "transcribed audio", + "Form": "spoken", + "Collection Style": "other", + "Description": "The source audio consists of conversational telephone speech recordings collected by LDC: (1) data collected for the RATS program from Levantine Arabic and Farsi speakers; and (2) material from Levantine Arabic QT Training Data Set 5, Speech (LDC2006S29) and CALLFRIEND Farsi Second Edition Speech (LDC2014S01).", + "Volume": "400", + "Unit": "hours", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "7,500.00 $", + "Test Split": "No", + "Tasks": "keyword spotting", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/rats_language_identification.json b/datasets/rats_language_identification.json new file mode 100644 index 0000000..81db864 --- /dev/null +++ b/datasets/rats_language_identification.json @@ -0,0 +1,36 @@ +{ + "Name": "RATS Language Identification", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2018S10", + "License": "LDC User Agreement for Non-Members", + "Year": 2018, + "Language": "multilingual", + "Dialect": "ar-LEV: (Arabic (Levant))", + "Domain": "transcribed audio", + "Form": "spoken", + "Collection Style": "other", + "Description": "The source audio consists of conversational telephone speech recordings from: (1) conversational telephone speech (CTS) recordings, taken either from previous LDC CTS corpora, or from CTS data collected specifically for the RATS program from Levantine Arabic, Pashto, Urdu, Farsi and Dari native speakers; and (2) portions of VOA broadcast news recordings, taken from data used in the 2009 NIST Language Recognition Evaluation. The 2009 LRE Test Set is available from LDC as LDC2014S06.", + "Volume": "600", + "Unit": "hours", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "7,500.00 $", + "Test Split": "No", + "Tasks": "language identification", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/rats_speaker_identification.json b/datasets/rats_speaker_identification.json new file mode 100644 index 0000000..b43e225 --- /dev/null +++ b/datasets/rats_speaker_identification.json @@ -0,0 +1,36 @@ +{ + "Name": "RATS Speaker Identification", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2021S08", + "License": "LDC User Agreement for Non-Members", + "Year": 2021, + "Language": "multilingual", + "Dialect": "ar-LEV: (Arabic (Levant))", + "Domain": "transcribed audio", + "Form": "spoken", + "Collection Style": "other", + "Description": "The source audio consists of conversational telephone speech recordings collected by LDC specifically for the RATS program from Levantine Arabic, Pashto, Urdu, Farsi and Dari native speakers. 
Annotations on the audio files include start time, end time, speech activity detection (SAD) label, SAD provenance, speaker ID, speaker ID provenance, language ID, and language ID provenance.", + "Volume": "1,900", + "Unit": "hours", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "7,500.00 $", + "Test Split": "No", + "Tasks": "speaker identification", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/rats_speech_activity_detection.json b/datasets/rats_speech_activity_detection.json new file mode 100644 index 0000000..7996e5f --- /dev/null +++ b/datasets/rats_speech_activity_detection.json @@ -0,0 +1,36 @@ +{ + "Name": "RATS Speech Activity Detection", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2015S02", + "License": "LDC User Agreement for Non-Members", + "Year": 2015, + "Language": "multilingual", + "Dialect": "ar-LEV: (Arabic (Levant))", + "Domain": "transcribed audio", + "Form": "spoken", + "Collection Style": "other", + "Description": "The source audio consists of conversational telephone speech recordings collected by LDC: (1) data collected for the RATS program from Levantine Arabic, Farsi, Pashto and Urdu speakers; and (2) material from the Fisher English (LDC2004S13, LDC2005S13), and Fisher Levantine Arabic telephone studies (LDC2007S02), as well as from CALLFRIEND Farsi (LDC2014S01).", + "Volume": "350", + "Unit": "hours", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "7,500.00 $", + "Test Split": "No", + "Tasks": "speech activity detection", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/reflex_entity_translation_training_devtest.json b/datasets/reflex_entity_translation_training_devtest.json new file mode 100644 index 0000000..3f33c9d --- /dev/null +++ b/datasets/reflex_entity_translation_training_devtest.json @@ -0,0 +1,36 @@ +{ + "Name": "REFLEX Entity Translation Training/DevTest", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2009T11", + "License": "LDC User Agreement for Non-Members", + "Year": 2009, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "other", + "Description": "nan", + "Volume": "22,500", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "LDC", + "Access": "Upon-Request", + "Cost": "nan", + "Test Split": "No", + "Tasks": "named entity recognition,information retrieval", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git
a/datasets/religious_hate_speech.json b/datasets/religious_hate_speech.json new file mode 100644 index 0000000..a8c3531 --- /dev/null +++ b/datasets/religious_hate_speech.json @@ -0,0 +1,36 @@ +{ + "Name": "Religious Hate Speech", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/Religious_Hate_Speech", + "Link": "https://github.com/nuhaalbadi/Arabic_hatespeech", + "License": "unknown", + "Year": 2018, + "Language": "ar", + "Dialect": "mixed", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "The training dataset contains 5,569 examples and the testing dataset contains 567 examples, collected from Twitter", + "Volume": "6,136", + "Unit": "sentences", + "Ethical Risks": "High", + "Provider": "Multiple Institutions", + "Derived From": "nan", + "Paper Title": "Are they Our Brothers? Analysis and Detection of Religious Hate Speech in the Arabic Twittersphere", + "Paper Link": "https://ieeexplore.ieee.org/document/8508247/authors#authors", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "hate speech detection", + "Venue Title": "ASONAM", + "Citations": "72.0", + "Venue Type": "conference", + "Venue Name": "IEEE/ACM International Conference on Advances in Social Networks Analysis and Mining", + "Authors": "Nuha Albadi,Maram Kurdi,Shivakant Mishra", + "Affiliations": ",,", + "Abstract": "Religious hate speech in the Arabic Twittersphere is a notable problem that requires developing automated tools to detect messages that use inflammatory sectarian language to promote hatred and violence against people on the basis of religious affiliation. Distinguishing hate speech from other profane and vulgar language is quite a challenging task that requires deep linguistic analysis. The richness of the Arabic morphology and the limited available resources for the Arabic language make this task even more challenging. To the best of our knowledge, this paper is the first to address the problem of identifying speech promoting religious hatred in the Arabic Twitter. In this work, we describe how we created the first publicly available Arabic dataset annotated for the task of religious hate speech detection and the first Arabic lexicon consisting of terms commonly found in religious discussions along with scores representing their polarity and strength. We then developed various classification models using lexicon-based, n-gram-based, and deep-learning-based approaches. A detailed comparison of the performance of different models on a completely new unseen dataset is then presented.
We find that a simple Recurrent Neural Network (RNN) architecture with Gated Recurrent Units (GRU) and pre-trained word embeddings can adequately detect religious hate speech with 0.84 Area Under the Receiver Operating Characteristic curve (AUROC).", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/rewayatech.json b/datasets/rewayatech.json new file mode 100644 index 0000000..a0d9957 --- /dev/null +++ b/datasets/rewayatech.json @@ -0,0 +1,36 @@ +{ + "Name": "Rewayatech", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/Rewayatech", + "Link": "https://github.com/aseelad/Rewayatech-Saudi-Stories/", + "License": "CC BY-NC-SA 4.0", + "Year": 2020, + "Language": "ar", + "Dialect": "ar-SA: (Arabic (Saudi Arabia))", + "Domain": "other", + "Form": "text", + "Collection Style": "crawling", + "Description": "a collection of Arabic stories written in online forums between 2003 and 2015 by users posting under anonymized usernames", + "Volume": "1,267", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "Multiple Institutions", + "Derived From": "nan", + "Paper Title": "Rewayatech: Saudi Web Novels Dataset", + "Paper Link": "https://www.preprints.org/manuscript/202008.0628/v1", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "text generation, language modeling", + "Venue Title": "other", + "Citations": "0.0", + "Venue Type": "preprint", + "Venue Name": "nan", + "Authors": "Aseel Addawood,D. Alzeer", + "Affiliations": ",", + "Abstract": "The internet has changed the way people perceived fiction to a new level. For instance, online forums have given people the opportunity to write without revealing their real identities. Especially in the Saudi context, online users were using these forums to write web novels that reflect their culture, lives, concerns, hopes and dreams. In this paper, we describe a dataset that was collected from one of the online forums that was used for sharing web novels among its readers. The collected dataset contains 1,267 novels between 2003-2015. This data set is available to the research community to analyze to gain a better understanding of the social, economical, and behavioral mindset that was manifested in the community in that decade.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/rsac.json b/datasets/rsac.json new file mode 100644 index 0000000..71ba65a --- /dev/null +++ b/datasets/rsac.json @@ -0,0 +1,36 @@ +{ + "Name": "RSAC", + "Subsets": [], + "HF Link": "nan", + "Link": "https://github.com/asooft/Sentiment-Analysis-Hotel-Reviews-Dataset", + "License": "unknown", + "Year": 2020, + "Language": "ar", + "Dialect": "mixed", + "Domain": "reviews", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "This dataset contains 6,318 hotel reviews collected from the Booking.com website.
The reviews are written in both standard and dialectal Arabic and manually annotated as either positive or negative.", + "Volume": "8,425", + "Unit": "sentences", + "Ethical Risks": "Medium", + "Provider": "Minia University", + "Derived From": "nan", + "Paper Title": "Sentiment Analysis for Arabic Reviews using Machine Learning Classification Algorithms", + "Paper Link": "https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=9047822", + "Script": "Arab", + "Tokenized": "Yes", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "sentiment analysis, review classification", + "Venue Title": "ITCE", + "Citations": "nan", + "Venue Type": "conference", + "Venue Name": "International Conference on Innovative Trends in Communication and Computer Engineering", + "Authors": "Awny A. Sayed, Enas Elgeldawi, Alaa M. Zaki, Ahmed R. Galal", + "Affiliations": "Computer Science Department, Faculty of Science, Minia University, Minia, Egypt", + "Abstract": "This paper presents a novel approach for enhancing Arabic sentiment analysis using nine machine learning classifiers on a dataset of hotel reviews. It aims to improve classification accuracy with various pre-processing and feature extraction techniques.", + "Added By": "Maryam Al Emadi" +} \ No newline at end of file diff --git a/datasets/sa`7r.json b/datasets/sa`7r.json new file mode 100644 index 0000000..15793c3 --- /dev/null +++ b/datasets/sa`7r.json @@ -0,0 +1,36 @@ +{ + "Name": "Sa`7r", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/SaudiIrony", + "Link": "https://github.com/iwan-rg/Saudi-Dialect-Irony-Dataset", + "License": "CC0", + "Year": 2022, + "Language": "ar", + "Dialect": "ar-SA: (Arabic (Saudi Arabia))", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "The dataset was collected using Twitter API and it consists of 19,810 tweets, 8,089 of them are labeled as ironic tweets.", + "Volume": "19,810", + "Unit": "sentences", + "Ethical Risks": "Medium", + "Provider": "King Saud University", + "Derived From": "nan", + "Paper Title": "Sa`7r: A Saudi Dialect Irony Dataset", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "irony detection", + "Venue Title": "OSACT", + "Citations": "nan", + "Venue Type": "workshop", + "Venue Name": "Open-Source Arabic Corpora and Processing Tools", + "Authors": "Halah AlMazrua, Najla AlHazzani, Amaal AlDawod, Lama AlAwlaqi, Noura AlReshoudi, Hend Al-Khalifa and Luluh AlDhubayi", + "Affiliations": "nan", + "Abstract": "In sentiment analysis, detecting irony is considered a major challenge. The key problem with detecting irony is the difficulty to recognize the implicit and indirect phrases which signifies the opposite meaning. In this paper, we present Sa`7r \u0633\u0627\u062e\u0631 the Saudi irony dataset, and describe our efforts in constructing it. The dataset was collected using Twitter API and it consists of 19,810 tweets, 8,089 of them are labeled as ironic tweets. We trained several models for irony detection task using machine learning models and deep learning models. The machine learning models include: K-Nearest Neighbor (KNN), Logistic Regression (LR), Support Vector Machine (SVM), and Na\u00efve Bayes (NB). While the deep learning models include BiLSTM and AraBERT.
The detection results show that among the tested machine learning models, the SVM outperformed other classifiers with an accuracy of 0.68. On the other hand, the deep learning models achieved an accuracy of 0.66 in the BiLSTM model and 0.71 in the AraBERT model. Thus, the AraBERT model achieved the most accurate result in detecting irony phrases in Saudi Dialect.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/sad.json b/datasets/sad.json new file mode 100644 index 0000000..e97d5ad --- /dev/null +++ b/datasets/sad.json @@ -0,0 +1,36 @@ +{ + "Name": "SAD", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/SAD", + "Link": "http://www.cs.stir.ac.uk/~lss/arabic/", + "License": "unknown", + "Year": 2014, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "transcribed audio", + "Form": "spoken", + "Collection Style": "manual curation", + "Description": "The Arabic speech corpus for isolated words contains 9,992 utterances of 20 words spoken by 50 native male Arabic speakers.", + "Volume": "6", + "Unit": "hours", + "Ethical Risks": "Low", + "Provider": "University of Stirling", + "Derived From": "nan", + "Paper Title": "On Improving the Classification Capability of Reservoir Computing for Arabic Speech Recognition", + "Paper Link": "https://link.springer.com/content/pdf/10.1007%2F978-3-319-11179-7_29.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "speech recognition", + "Venue Title": "ICANN", + "Citations": "13.0", + "Venue Type": "conference", + "Venue Name": "International Conference on Artificial Neural Networks", + "Authors": "A. Alalshekmubarak,Leslie S. Smith", + "Affiliations": ",", + "Abstract": "Designing noise-resilient systems is a major challenge in the field of automated speech recognition (ASR). These systems are crucial for real-world applications where high levels of noise tend to be present. We introduce a noise robust system based on Echo State Networks and Extreme Kernel machines which we call ESNEKM. To evaluate the performance of the proposed system, we used our recently released public Arabic speech dataset and the well-known spoken Arabic digits (SAD) dataset. Different feature extraction methods considered in this study include mel-frequency cepstral coefficients (MFCCs), perceptual linear prediction (PLP) and RASTA-perceptual linear prediction. These extracted features were fed to the ESNEKM and the result compared with a baseline hidden Markov model (HMM), so that nine models were compared in total. ESNEKM models outperformed HMM models under all the feature extraction methods, noise levels, and noise types.
The best performance was obtained by the model that combined RASTA-PLP with ESNEKM.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/salma.json b/datasets/salma.json new file mode 100644 index 0000000..7b8efaf --- /dev/null +++ b/datasets/salma.json @@ -0,0 +1,36 @@ +{ + "Name": "SALMA", + "Subsets": [], + "HF Link": "nan", + "Link": "https://sina.birzeit.edu/salma/", + "License": "unknown", + "Year": 2023, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "web pages", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "SALMA corpus is part of the Wojood corpus (Jarrar et al., 2022), and was collected from 33 online media sources written in Modern Standard Arabic (MSA) and covering general topics.", + "Volume": "34,000", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "SinaLab, Birzeit University", + "Derived From": "Wojood", + "Paper Title": "SALMA: Arabic Sense-Annotated Corpus and WSD Benchmarks", + "Paper Link": "https://www.jarrar.info/publications/JMHK23.pdf", + "Script": "Arab", + "Tokenized": "Yes", + "Host": "other", + "Access": "Upon-Request", + "Cost": "nan", + "Test Split": "No", + "Tasks": "word sense disambiguation", + "Venue Title": "ArabicNLP", + "Citations": "nan", + "Venue Type": "conference", + "Venue Name": "Arabic Natural Language Processing Conference", + "Authors": "Mustafa Jarrar, Sanad Malaysha, Tymaa Hammouda, Mohammed Khalilia", + "Affiliations": "Birzeit University;Birzeit University;Birzeit University;Birzeit University", + "Abstract": "SALMA, the first Arabic sense-annotated corpus, consists of ~34K tokens, which are all sense-annotated. The corpus is annotated using two different sense inventories simultaneously (Modern and Ghani). SALMA novelty lies in how tokens and senses are associated. Instead of linking a token to only one intended sense, SALMA links a token to multiple senses and provides a score to each sense. A smart web-based annotation tool was developed to support scoring multiple senses against a given word. In addition to sense annotations, we also annotated the corpus using six types of named entities. The quality of our annotations was assessed using various metrics (Kappa, Linear Weighted Kappa, Quadratic Weighted Kappa, Mean Average Error, and Root Mean Square Error), which show very high inter-annotator agreement. To establish a Word Sense Disambiguation baseline using our SALMA corpus, we developed an end-to-end Word Sense Disambiguation system using Target Sense Verification. We used this system to evaluate three Target Sense Verification models available in the literature. Our best model achieved an accuracy with 84.2% using Modern and 78.7% using Ghani.
The full corpus and the annotation tool are open-source and publicly available at https://sina.birzeit.edu/salma/.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/samer_readability_lexicon.json b/datasets/samer_readability_lexicon.json new file mode 100644 index 0000000..5ca5f38 --- /dev/null +++ b/datasets/samer_readability_lexicon.json @@ -0,0 +1,36 @@ +{ + "Name": "SAMER readability lexicon", + "Subsets": [], + "HF Link": "nan", + "Link": "https://camel.abudhabi.nyu.edu/samer-readability-lexicon/", + "License": "custom", + "Year": 2020, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "manual curation", + "Description": "The SAMER readability lexicon is a large-scale 26,000-lemma leveled readability lexicon for Modern Standard Arabic. The lexicon was manually annotated in triplicate by language professionals from three regions in the Arab world.", + "Volume": "26,000", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "NYU Abu Dhabi", + "Derived From": "nan", + "Paper Title": "A Large-Scale Leveled Readability Lexicon for Standard Arabic", + "Paper Link": "https://aclanthology.org/2020.lrec-1.373.pdf", + "Script": "Arab", + "Tokenized": "Yes", + "Host": "CAMeL Resources", + "Access": "Upon-Request", + "Cost": "nan", + "Test Split": "No", + "Tasks": "readability assessment", + "Venue Title": "LREC", + "Citations": "nan", + "Venue Type": "conference", + "Venue Name": "Language Resources and Evaluation Conference", + "Authors": "Muhamed Al Khalil, Nizar Habash, Zhengyang Jiang", + "Affiliations": "New York University Abu Dhabi", + "Abstract": "We present a large-scale 26,000-lemma leveled readability lexicon for Modern Standard Arabic. The lexicon was manually annotated in triplicate by language professionals from three regions in the Arab world. The annotations show a high degree of agreement; and major differences were limited to regional variations. Comparing lemma readability levels with their frequencies provided good insights in the benefits and pitfalls of frequency-based readability approaches.
The lexicon will be publicly available.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/sanad.json b/datasets/sanad.json new file mode 100644 index 0000000..6c6667a --- /dev/null +++ b/datasets/sanad.json @@ -0,0 +1,36 @@ +{ + "Name": "SANAD", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/SANAD", + "Link": "https://data.mendeley.com/datasets/57zpx667y9/2", + "License": "CC BY 4.0", + "Year": 2019, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "textual data collected from three news portals", + "Volume": "194,797", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "Sharjah University", + "Derived From": "nan", + "Paper Title": "SANAD: Single-label Arabic News Articles Dataset for automatic text categorization", + "Paper Link": "https://www.sciencedirect.com/science/article/pii/S2352340919304305", + "Script": "Arab", + "Tokenized": "No", + "Host": "Mendeley Data", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "topic classification", + "Venue Title": "Data in Brief", + "Citations": "18.0", + "Venue Type": "journal", + "Venue Name": "Data in Brief", + "Authors": "Omar Einea,Ashraf Elnagar,Ridhwan Al Debsi", + "Affiliations": ",,", + "Abstract": "Text Classification is one of the most popular Natural Language Processing (NLP) tasks. Text classification (aka categorization) is an active research topic in recent years. However, much less attention was directed towards this task in Arabic, due to the lack of rich representative resources for training an Arabic text classifier. Therefore, we introduce a large Single-labeled Arabic News Articles Dataset (SANAD) of textual data collected from three news portals. The dataset is a large one consisting of almost 200k articles distributed into seven categories that we offer to the research community on Arabic computational linguistics. We anticipate that this rich dataset would make a great aid for a variety of NLP tasks on Modern Standard Arabic (MSA) textual data, especially for single label text classification purposes. We present the data in raw form. SANAD is composed of three main datasets scraped from three news portals, which are AlKhaleej, AlArabiya, and Akhbarona. SANAD is made public and freely available at https://data.mendeley.com/datasets/57zpx667y9.", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/sanadset_650k__data_on_hadith_narrators.json b/datasets/sanadset_650k__data_on_hadith_narrators.json new file mode 100644 index 0000000..3976942 --- /dev/null +++ b/datasets/sanadset_650k__data_on_hadith_narrators.json @@ -0,0 +1,36 @@ +{ + "Name": "Sanadset 650K: Data on Hadith Narrators", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/Sanadset", + "Link": "https://data.mendeley.com/datasets/5xth87zwb5/4", + "License": "CC BY 4.0", + "Year": 2022, + "Language": "ar", + "Dialect": "ar-CLS: (Arabic (Classic))", + "Domain": "books", + "Form": "text", + "Collection Style": "crawling", + "Description": "Sanadset is a full hadith dataset that contains over 650,986 records collected from 926 historical Arabic books of hadith.
This dataset can be used for further investigation and classification of hadiths (Strong/Weak), and narrators (trustworthy/not) using AI techniques, and also it can be used as a linguistic resource tool for Arabic Natural Language Processing.", + "Volume": "650,000", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "Abdelmalek Essa\u00e2di University", + "Derived From": "nan", + "Paper Title": "Sanadset 650K: Data on Hadith Narrators", + "Paper Link": "https://doi.org/10.1016/j.dib.2022.108540", + "Script": "Arab", + "Tokenized": "No", + "Host": "Mendeley Data", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "machine translation, topic classification, text generation, named entity recognition, question answering, information retrieval, natural language inference", + "Venue Title": "Data in Brief", + "Citations": "nan", + "Venue Type": "journal", + "Venue Name": "Data in Brief", + "Authors": "Mohammed Mghari,Omar Bouras,Abdelaaziz El Hibaoui", + "Affiliations": "nan", + "Abstract": "The chain of narrators (Sanad) plays a vital role in deciding the authenticity of Islamic hadiths. However, the investigation and validation of such Sanad fully depend on scientists (Hadith Scholars). They ordinarily utilize their acquired knowledge, which in this manner needs a critical sum of exertion and time.\n\nAutomated Sanad evaluation using machine learning algorithms is the best way to solve this problem. Therefore, a representative Sanad dataset is required.\n\nThis paper presents a full hadith dataset which is named Sanadset and is made openly accessible for researchers. Sanadset corpus contains over 650,986 records collected from 926 historical Arabic books of hadith. This dataset can be used for further investigation and classification of hadiths (Strong/Weak), and narrators (trustworthy/not) using AI techniques, and also it can be used as a linguistic resource tool for Arabic Natural Language Processing.\n\nOur dataset is collected from online Hadith sources using data scraping and web crawling. The main contribution of this dataset is the extraction of narrator chains that were originally present in textual form within Hadith books. 
Each observation in the dataset contains complete information about a specific hadith, such as (original book, number, Hadith text, Matn, list of narrators, and the number of narrators).", + "Added By": "Mohammed Mghari" +} \ No newline at end of file diff --git a/datasets/saudinewsnet.json b/datasets/saudinewsnet.json new file mode 100644 index 0000000..157659d --- /dev/null +++ b/datasets/saudinewsnet.json @@ -0,0 +1,36 @@ +{ + "Name": "SaudiNewsNet", + "Subsets": [], + "HF Link": "https://hf.co/datasets/inparallel/saudinewsnet", + "Link": "https://github.com/inparallel/SaudiNewsNet", + "License": "CC BY-NC-SA 4.0", + "Year": 2015, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "crawling", + "Description": "a set of 31,030 Arabic newspaper articles along with metadata, extracted from various online Saudi newspapers.", + "Volume": "31,030", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "language modeling, text generation", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/semeval-2017_task_4.json b/datasets/semeval-2017_task_4.json new file mode 100644 index 0000000..798abe4 --- /dev/null +++ b/datasets/semeval-2017_task_4.json @@ -0,0 +1,36 @@ +{ + "Name": "SemEval-2017 Task 4", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/SemEval_2017_ar_subtask_A", + "Link": "https://alt.qcri.org/semeval2017/task4/index.php?id=data-and-tools", + "License": "unknown", + "Year": 2017, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "A large accessible benchmark dataset containing over 70,000 tweets across two languages", + "Volume": "70,000", + "Unit": "sentences", + "Ethical Risks": "Medium", + "Provider": "Multiple Institutions", + "Derived From": "nan", + "Paper Title": "SemEval-2017 Task 4: Sentiment Analysis in Twitter", + "Paper Link": "https://aclanthology.org/S17-2088.pdf", + "Script": "Arab-Latn", + "Tokenized": "No", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "sentiment analysis", + "Venue Title": "SEMEVAL", + "Citations": "14.0", + "Venue Type": "workshop", + "Venue Name": "International Workshop on Semantic Evaluation", + "Authors": "Sara Rosenthal,N. Farra,Preslav Nakov", + "Affiliations": ",,", + "Abstract": "This paper describes the fifth year of the Sentiment Analysis in Twitter task. SemEval-2017 Task 4 continues with a rerun of the subtasks of SemEval-2016 Task 4, which include identifying the overall sentiment of the tweet, sentiment towards a topic with classification on a two-point and on a five-point ordinal scale, and quantification of the distribution of sentiment towards a topic across a number of tweets: again on a two-point and on a five-point ordinal scale.
Compared to 2016, we made two changes: (i) we introduced a new language, Arabic, for all subtasks, and (ii) we made available information from the profiles of the Twitter users who posted the target tweets. The task continues to be very popular, with a total of 48 teams participating this year.", + "Added By": "Mustafa Ghaleb" +} \ No newline at end of file diff --git a/datasets/semeval-2018_task_1.json b/datasets/semeval-2018_task_1.json new file mode 100644 index 0000000..d6d8d85 --- /dev/null +++ b/datasets/semeval-2018_task_1.json @@ -0,0 +1,36 @@ +{ + "Name": "SemEval-2018 Task 1", + "Subsets": [], + "HF Link": "https://hf.co/datasets/SemEvalWorkshop/sem_eval_2018_task_1", + "Link": "https://competitions.codalab.org/competitions/17751#learn_the_details-datasets", + "License": "unknown", + "Year": 2018, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "A new Affect in Tweets dataset of more than 22,000 tweets, with subsets annotated for a number of emotion dimensions.", + "Volume": "22,000", + "Unit": "tokens", + "Ethical Risks": "Medium", + "Provider": "Multiple Institutions", + "Derived From": "nan", + "Paper Title": "SemEval-2018 Task 1: Affect in Tweets", + "Paper Link": "https://aclanthology.org/S18-1001.pdf", + "Script": "Arab-Latn", + "Tokenized": "No", + "Host": "CodaLab", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "emotion intensity regression, emotion intensity classification, valence regression, valence classification, emotion detection", + "Venue Title": "SEMEVAL", + "Citations": "322.0", + "Venue Type": "workshop", + "Venue Name": "International Workshop on Semantic Evaluation", + "Authors": "Saif M. Mohammad,Felipe Bravo-Marquez,Mohammad Salameh,Svetlana Kiritchenko", + "Affiliations": "National Research Council Canada,University of Chile,,", + "Abstract": "We present the SemEval-2018 Task 1: Affect in Tweets, which includes an array of subtasks on inferring the affectual state of a person from their tweet. For each task, we created labeled data from English, Arabic, and Spanish tweets. The individual tasks are: 1. emotion intensity regression, 2. emotion intensity ordinal classification, 3. valence (sentiment) regression, 4. valence ordinal classification, and 5. emotion classification. Seventy-five teams (about 200 team members) participated in the shared task. We summarize the methods, resources, and tools used by the participating teams, with a focus on the techniques and resources that are particularly useful. We also analyze systems for consistent bias towards a particular race or gender.
The data is made freely available to further improve our understanding of how people convey emotions through language.", + "Added By": "Mustafa Ghaleb" +} \ No newline at end of file diff --git a/datasets/semeval-2021_task_2.json b/datasets/semeval-2021_task_2.json new file mode 100644 index 0000000..d6b0e94 --- /dev/null +++ b/datasets/semeval-2021_task_2.json @@ -0,0 +1,36 @@ +{ + "Name": "SemEval-2021 Task 2", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/MCL_WiC_ar", + "Link": "https://github.com/SapienzaNLP/mcl-wic", + "License": "CC BY-NC 4.0", + "Year": 2021, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "manual curation", + "Description": "the first entirely manually-curated WiC-style dataset in five European and non-European languages, namely Arabic, Chinese, English, French and Russian.", + "Volume": "2,000", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "Sapienza University of Rome, Italy", + "Derived From": "nan", + "Paper Title": "SemEval-2021 Task 2: Multilingual and Cross-lingual Word-in-Context Disambiguation (MCL-WiC)", + "Paper Link": "https://aclanthology.org/2021.semeval-1.3.pdf", + "Script": "Arab-Latn", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "word sense disambiguation", + "Venue Title": "SEMEVAL", + "Citations": "22.0", + "Venue Type": "workshop", + "Venue Name": "International Workshop on Semantic Evaluation", + "Authors": "Federico Martelli,N. Kalach,Gabriele Tola,Roberto Navigli", + "Affiliations": ",,,", + "Abstract": "In this paper, we introduce the first SemEval task on Multilingual and Cross-Lingual Word-in-Context disambiguation (MCL-WiC). This task allows the largely under-investigated inherent ability of systems to discriminate between word senses within and across languages to be evaluated, dropping the requirement of a fixed sense inventory. Framed as a binary classification, our task is divided into two parts. In the multilingual sub-task, participating systems are required to determine whether two target words, each occurring in a different context within the same language, express the same meaning or not. Instead, in the cross-lingual part, systems are asked to perform the task in a cross-lingual scenario, in which the two target words and their corresponding contexts are provided in two different languages. We illustrate our task, as well as the construction of our manually-created dataset including five languages, namely Arabic, Chinese, English, French and Russian, and the results of the participating systems. 
Datasets and results are available at: https://github.com/SapienzaNLP/mcl-wic.", + "Added By": "Mustafa Ghaleb" +} \ No newline at end of file diff --git a/datasets/senti_lex.json b/datasets/senti_lex.json new file mode 100644 index 0000000..51735f2 --- /dev/null +++ b/datasets/senti_lex.json @@ -0,0 +1,36 @@ +{ + "Name": "Senti lex", + "Subsets": [], + "HF Link": "https://hf.co/datasets/senti-lex/senti_lex", + "Link": "https://www.kaggle.com/datasets/rtatman/sentiment-lexicons-for-81-languages", + "License": "GPL-3.0", + "Year": 2014, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "wikipedia", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "This dataset provides sentiment lexicons for 81 languages, generated via graph propagation over a knowledge graph, a graphical representation of real-world entities and the links between them", + "Volume": "2,794", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "Stony Brook University", + "Derived From": "nan", + "Paper Title": "Building Sentiment Lexicons for All Major Languages", + "Paper Link": "https://aclanthology.org/P14-2063.pdf", + "Script": "Arab", + "Tokenized": "Yes", + "Host": "kaggle", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "sentiment analysis", + "Venue Title": "ACL", + "Citations": "186.0", + "Venue Type": "conference", + "Venue Name": "Association for Computational Linguistics", + "Authors": "Yanqing Chen, S. Skiena", + "Affiliations": "Stony Brook University", + "Abstract": "Sentiment analysis in a multilingual world remains a challenging problem, because developing language-specific sentiment lexicons is an extremely resourceintensive process. Such lexicons remain a scarce resource for most languages. In this paper, we address this lexicon gap by building high-quality sentiment lexicons for 136 major languages. We integrate a variety of linguistic resources to produce an immense knowledge graph. By appropriately propagating from seed words, we construct sentiment lexicons for each component language of our graph. Our lexicons have a polarity agreement of 95.7% with published lexicons, while achieving an overall coverage of 45.2%. We demonstrate the performance of our lexicons in an extrinsic analysis of 2,000 distinct historical figures\u2019 Wikipedia articles on 30 languages. Despite cultural difference and the intended neutrality of Wikipedia articles, our lexicons show an average sentiment correlation of 0.28 across all language pairs.", + "Added By": "Abdelrahman Kaseb" +} \ No newline at end of file diff --git a/datasets/sentiment_lexicons_for_81_languages.json b/datasets/sentiment_lexicons_for_81_languages.json new file mode 100644 index 0000000..e34fce5 --- /dev/null +++ b/datasets/sentiment_lexicons_for_81_languages.json @@ -0,0 +1,36 @@ +{ + "Name": "Sentiment Lexicons for 81 Languages", + "Subsets": [], + "HF Link": "https://hf.co/datasets/senti_lex", + "Link": "https://www.kaggle.com/datasets/rtatman/sentiment-lexicons-for-81-languages", + "License": "unknown", + "Year": 2017, + "Language": "multilingual", + "Dialect": "mixed", + "Domain": "other", + "Form": "text", + "Collection Style": "crawling", + "Description": "Sentiment analysis, the task of automatically detecting whether a piece of text is positive or negative, generally relies on a hand-curated list of words with positive sentiment (good, great, awesome) and negative sentiment (bad, gross, awful).
This dataset contains both positive and negative sentiment lexicons for 81 languages.", + "Volume": "2,794", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "Yes", + "Host": "GitLab", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "sentiment analysis", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "Rachael Tatman", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Abdelrahman Rezk" +} \ No newline at end of file diff --git a/datasets/senwave.json b/datasets/senwave.json new file mode 100644 index 0000000..5547df1 --- /dev/null +++ b/datasets/senwave.json @@ -0,0 +1,36 @@ +{ + "Name": "SenWave", + "Subsets": [], + "HF Link": "nan", + "Link": "https://github.com/gitdevqiang/SenWave", + "License": "unknown", + "Year": 2020, + "Language": "multilingual", + "Dialect": "mixed", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "The largest fine-grained annotated Covid-19 tweets dataset", + "Volume": "10,000", + "Unit": "sentences", + "Ethical Risks": "Medium", + "Provider": "Multiple Institutions", + "Derived From": "nan", + "Paper Title": "SenWave: Monitoring the Global Sentiments under the COVID-19 Pandemic", + "Paper Link": "https://arxiv.org/pdf/2006.10842.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Upon-Request", + "Cost": "nan", + "Test Split": "No", + "Tasks": "emotion detection", + "Venue Title": "arXiv", + "Citations": "nan", + "Venue Type": "preprint", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Nora Alturayeif" +} \ No newline at end of file diff --git a/datasets/senzi.json b/datasets/senzi.json new file mode 100644 index 0000000..026bd11 --- /dev/null +++ b/datasets/senzi.json @@ -0,0 +1,36 @@ +{ + "Name": "SenZi", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/SenZi", + "Link": "https://tahatobaili.github.io/project-rbz/", + "License": "custom", + "Year": 2019, + "Language": "ar", + "Dialect": "ar-LB: (Arabic (Lebanon))", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "Built by translating, annotating, and transliterating other resources to obtain an initial set of 2K sentiment words, then expanded to 24.6K sentiment words by importing inflectional and orthographic forms using word embeddings", + "Volume": "24,600", + "Unit": "tokens", + "Ethical Risks": "Medium", + "Provider": "Multiple Institutions", + "Derived From": "nan", + "Paper Title": "SenZi: A Sentiment Analysis Lexicon for the Latinised Arabic (Arabizi)", + "Paper Link": "https://aclanthology.org/R19-1138.pdf", + "Script": "Arab-Latn", + "Tokenized": "No", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "sentiment analysis, transliteration", + "Venue Title": "RANLP", + "Citations": "5.0", + "Venue Type": "conference", + "Venue Name": "Recent Advances in Natural Language Processing", + "Authors": "Taha Tobaili,Miriam Fern\u00e1ndez,Harith Alani,S. Sharafeddine,Hazem M. Hajj,Goran Glavas", + "Affiliations": ",,,,,", + "Abstract": "Arabizi is an informal written form of dialectal Arabic transcribed in Latin alphanumeric characters.
It has a proven popularity on chat platforms and social media, yet it suffers from a severe lack of natural language processing (NLP) resources. As such, texts written in Arabizi are often disregarded in sentiment analysis tasks for Arabic. In this paper we describe the creation of a sentiment lexicon for Arabizi that was enriched with word embeddings. The result is a new Arabizi lexicon consisting of 11.3K positive and 13.3K negative words. We evaluated this lexicon by classifying the sentiment of Arabizi tweets achieving an F1-score of 0.72. We provide a detailed error analysis to present the challenges that impact the sentiment analysis of Arabizi.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/serag__semantic_entity_retrieval_from_arabic_knowledge_graphs.json b/datasets/serag__semantic_entity_retrieval_from_arabic_knowledge_graphs.json new file mode 100644 index 0000000..3b35120 --- /dev/null +++ b/datasets/serag__semantic_entity_retrieval_from_arabic_knowledge_graphs.json @@ -0,0 +1,36 @@ +{ + "Name": "SERAG: Semantic Entity Retrieval from Arabic knowledge Graphs", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/ArabicDEv2", + "Link": "https://zenodo.org/record/4560653#.YprApXZBxD8", + "License": "CC BY 4.0", + "Year": 2021, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "human translation", + "Description": "This is the dataset used in: \"S. Esmeir, SERAG: Semantic Entity Retrieval from Arabic knowledge Graphs, In: Proceedings of the Sixth Arabic Natural Language Processing Workshop (WANLP 2021) at EACL 2021\". The dataset is a translation of a subset (139/467) of the queries in DBpedia Entity v2 to Modern Standard Arabic. We used the \u201cstopped\u201d version of DBpedia Entity v2 (queries-v2_stopped.txt). Please use the query ID to link the translated queries to their English counterparts, and to the relevance judgment files provided with DBpedia Entity v2 (qrels-v2.txt). DBpedia\u2019s interlingual mapping file (interlanguage_links_ar.ttl.bz2) can be used to map entities from English to Arabic and vice-versa.", + "Volume": "139", + "Unit": "sentences", + "Ethical Risks": "nan", + "Provider": "nan", + "Derived From": "DBpedia Entity v2", + "Paper Title": "SERAG: Semantic Entity Retrieval from Arabic Knowledge Graphs", + "Paper Link": "https://aclanthology.org/2021.wanlp-1.24/", + "Script": "Arab", + "Tokenized": "No", + "Host": "zenodo", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "semantic entity retrieval", + "Venue Title": "ACL", + "Citations": "nan", + "Venue Type": "workshop", + "Venue Name": "The Sixth Arabic Natural Language Processing Workshop (WANLP 2021)", + "Authors": "Saher Esmeir", + "Affiliations": "Bloomberg L.P", + "Abstract": "Knowledge graphs (KGs) are widely used to store and access information about entities and their relationships. Given a query, the task of entity retrieval from a KG aims at presenting a ranked list of entities relevant to the query. Lately, an increasing number of models for entity retrieval have shown a significant improvement over traditional methods. These models, however, were developed for English KGs. In this work, we build on one such system, named KEWER, to propose SERAG (Semantic Entity Retrieval from Arabic knowledge Graphs). Like KEWER, SERAG uses random walks to generate entity embeddings.
DBpedia-Entity v2 is considered the standard test collection for entity retrieval. We discuss the challenges of using it for non-English languages in general and Arabic in particular. We provide an Arabic version of this standard collection, and use it to evaluate SERAG. SERAG is shown to significantly outperform the popular BM25 model thanks to its multi-hop reasoning.", + "Added By": "Kamel GAANOUN" +} \ No newline at end of file diff --git a/datasets/shakkelha.json b/datasets/shakkelha.json new file mode 100644 index 0000000..76a25a1 --- /dev/null +++ b/datasets/shakkelha.json @@ -0,0 +1,36 @@ +{ + "Name": "Shakkelha", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/shakkelha", + "Link": "https://github.com/AliOsm/shakkelha", + "License": "MIT License", + "Year": 2019, + "Language": "ar", + "Dialect": "ar-CLS: (Arabic (Classic))", + "Domain": "books", + "Form": "text", + "Collection Style": "other", + "Description": "An Arabic text diacritization extension dataset that should be used for training only. This dataset is an extension of the dataset provided here: https://github.com/AliOsm/arabic-text-diacritization, and both of them were derived from the same source, which is the Tashkeela dataset.", + "Volume": "533,000", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "Jordan University of Science and Technology (JUST)", + "Derived From": "Tashkeela", + "Paper Title": "Neural Arabic Text Diacritization: State of the Art Results and a Novel Approach for Machine Translation", + "Paper Link": "https://aclanthology.org/D19-5229/", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "diacritization", + "Venue Title": "EMNLP-IJCNLP", + "Citations": "14.0", + "Venue Type": "workshop", + "Venue Name": "Workshop on Asian Translation (WAT)", + "Authors": "Ali Fadel, Ibraheem Tuffaha, Bara\u2019 Al-Jawarneh, and Mahmoud Al-Ayyoub", + "Affiliations": "Jordan University of Science and Technology (JUST)", + "Abstract": "In this work, we present several deep learning models for the automatic diacritization of Arabic text. Our models are built using two main approaches, viz. Feed-Forward Neural Network (FFNN) and Recurrent Neural Network (RNN), with several enhancements such as 100-hot encoding, embeddings, Conditional Random Field (CRF) and Block-Normalized Gradient (BNG). The models are tested on the only freely available benchmark dataset and the results show that our models are either better or on par with other models, which require language-dependent post-processing steps, unlike ours.
Moreover, we show that diacritics in Arabic can be used to enhance the models of NLP tasks such as Machine Translation (MT) by proposing the Translation over Diacritization (ToD) approach.", + "Added By": "Ali Hamdi Ali Fadel" +} \ No newline at end of file diff --git a/datasets/shamela.json b/datasets/shamela.json new file mode 100644 index 0000000..a8fe23d --- /dev/null +++ b/datasets/shamela.json @@ -0,0 +1,36 @@ +{ + "Name": "Shamela", + "Subsets": [], + "HF Link": "nan", + "Link": "https://github.com/OpenArabic/", + "License": "unknown", + "Year": 2016, + "Language": "ar", + "Dialect": "ar-CLS: (Arabic (Classic))", + "Domain": "books", + "Form": "text", + "Collection Style": "crawling", + "Description": "a large-scale, historical corpus of Arabic of about 1 billion words from diverse periods of time", + "Volume": "6,100", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "Multiple Institutions", + "Derived From": "OpenITI", + "Paper Title": "Shamela: A Large-Scale Historical Arabic Corpus", + "Paper Link": "https://arxiv.org/pdf/1612.08989.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "text generation, language modeling", + "Venue Title": "LT4DH", + "Citations": "9.0", + "Venue Type": "workshop", + "Venue Name": "Workshop on Language Technology Resources and Tools for Digital Humanities", + "Authors": "Yonatan Belinkov,Alexander Magidow,Maxim Romanov,Avi Shmidman,Moshe Koppel", + "Affiliations": ",,,,", + "Abstract": "Arabic is a widely-spoken language with a rich and long history spanning more than fourteen centuries. Yet existing Arabic corpora largely focus on the modern period or lack sufficient diachronic information. We develop a large-scale, historical corpus of Arabic of about 1 billion words from diverse periods of time. We clean this corpus, process it with a morphological analyzer, and enhance it by detecting parallel passages and automatically dating undated texts. We demonstrate its utility with selected case-studies in which we show its application to the digital humanities.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/shamela_diacritics_corpus.json b/datasets/shamela_diacritics_corpus.json new file mode 100644 index 0000000..8a5862d --- /dev/null +++ b/datasets/shamela_diacritics_corpus.json @@ -0,0 +1,36 @@ +{ + "Name": "Shamela Diacritics Corpus", + "Subsets": [], + "HF Link": "nan", + "Link": "https://archive.org/details/shamela-diacritics-corpus", + "License": "unknown", + "Year": 2023, + "Language": "ar", + "Dialect": "ar-CLS: (Arabic (Classic))", + "Domain": "books", + "Form": "text", + "Collection Style": "other", + "Description": "An Arabic diacriticized corpus using data from the old Maktaba Shamela website", + "Volume": "1,305", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "Independent", + "Derived From": "Old Maktaba Shamela website: https://t.me/shamela_kindle/37678", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "diacritization", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Mohamed H."
+} \ No newline at end of file diff --git a/datasets/shamela_et_al_arabic_corpus.json b/datasets/shamela_et_al_arabic_corpus.json new file mode 100644 index 0000000..29db3e4 --- /dev/null +++ b/datasets/shamela_et_al_arabic_corpus.json @@ -0,0 +1,36 @@ +{ + "Name": "Shamela et al Arabic Corpus", + "Subsets": [], + "HF Link": "nan", + "Link": "https://github.com/tarekeldeeb/GloVe-Arabic/tree/master/arabic_corpus", + "License": "CC BY 4.0", + "Year": 2018, + "Language": "ar", + "Dialect": "ar-CLS: (Arabic (Classic))", + "Domain": "other", + "Form": "text", + "Collection Style": "crawling", + "Description": "The Arabic corpus {1.9B words} consists of the following resources:\n\nShamelaLibrary348.7z link {1.15B}\nUN arabic corpus mirror1 mirror2 {0.37B}\nAraCorpus.tar.gz link {0.14B}\nArabic Wikipedia Latest Articles Dump link {0.11B}\nTashkeela-arabic-diacritized-text-utf8-0.3.zip link {0.07B}\nArabic Tweets link {0.03B}\nwatan-2004.7z link {0.01B}", + "Volume": "1,754,541,204", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "Yes", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "text generation, language modeling", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Tarek Eldeeb" +} \ No newline at end of file diff --git a/datasets/shami.json b/datasets/shami.json new file mode 100644 index 0000000..4fbddd0 --- /dev/null +++ b/datasets/shami.json @@ -0,0 +1,61 @@ +{ + "Name": "Shami", + "Subsets": [ + { + "Name": "Jordanian", + "Dialect": "ar-JO: (Arabic (Jordan))", + "Volume": "32,078", + "Unit": "sentences" + }, + { + "Name": "Palestinian", + "Dialect": "ar-PS: (Arabic (Palestine))", + "Volume": "21,264", + "Unit": "sentences" + }, + { + "Name": "Syrian", + "Dialect": "ar-SY: (Arabic (Syria))", + "Volume": "48,159", + "Unit": "sentences" + }, + { + "Name": "Lebanese", + "Dialect": "ar-LB: (Arabic (Lebanon))", + "Volume": "16,304", + "Unit": "sentences" + } + ], + "HF Link": "https://hf.co/datasets/arbml/Shami", + "Link": "https://github.com/GU-CLASP/shami-corpus", + "License": "Apache-2.0", + "Year": 2018, + "Language": "ar", + "Dialect": "ar-LEV: (Arabic (Levant))", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "the first Levantine Dialect Corpus (SDC) covering data from the four dialects spoken in Palestine, Jordan, Lebanon and Syria.", + "Volume": "117,805", + "Unit": "sentences", + "Ethical Risks": "Medium", + "Provider": "Multiple Institutions", + "Derived From": "nan", + "Paper Title": "Shami: A Corpus of Levantine Arabic Dialects", + "Paper Link": "https://aclanthology.org/L18-1576.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "dialect identification", + "Venue Title": "LREC", + "Citations": "25.0", + "Venue Type": "conference", + "Venue Name": "International Conference on Language Resources and Evaluation", + "Authors": "Chatrine Qwaider,Motaz Saad,S. Chatzikyriakidis,Simon Dobnik", + "Affiliations": ",The Islamic University of Gaza,,", + "Abstract": "Modern Standard Arabic (MSA) is the official language used in education and media across the Arab world both in writing and formal speech.
However, in daily communication several dialects depending on the country, region as well as other social factors, are used. With the emergence of social media, the dialectal amount of data on the Internet have increased and the NLP tools that support MSA are not well-suited to process this data due to the difference between the dialects and MSA. In this paper, we construct the Shami corpus, the first Levantine Dialect Corpus (SDC) covering data from the four dialects spoken in Palestine, Jordan, Lebanon and Syria. We also describe rules for pre-processing without affecting the meaning so that it is processable by NLP tools. We choose Dialect Identification as the task to evaluate SDC and compare it with two other corpora. In this respect, experiments are conducted using different parameters based on n-gram models and Naive Bayes classifiers. SDC is larger than the existing corpora in terms of size, words and vocabularies. In addition, we use the performance on the Language Identification task to exemplify the similarities and differences in the individual dialects.", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/snad.json b/datasets/snad.json new file mode 100644 index 0000000..9dd5391 --- /dev/null +++ b/datasets/snad.json @@ -0,0 +1,36 @@ +{ + "Name": "SNAD", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/SNAD", + "Link": "https://drive.google.com/file/d/1uwD56jaVIbsQQWVqqyL08TgjuTraYFJC/view", + "License": "unknown", + "Year": 2020, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "crawling", + "Description": "The data was scraped and collected from two of the most famous news sources in Saudi Arabia", + "Volume": "45,000", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "Prince Sultan University", + "Derived From": "nan", + "Paper Title": "SNAD Arabic Dataset for Deep Learning", + "Paper Link": "https://link.springer.com/chapter/10.1007/978-3-030-55180-3_47", + "Script": "Arab", + "Tokenized": "No", + "Host": "Gdrive", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "topic classification", + "Venue Title": "IntelliSys", + "Citations": "1.0", + "Venue Type": "conference", + "Venue Name": "Intelligent Systems Conference", + "Authors": "Deem Alsaleh,Mashael Bin Alamir,Souad Larabi Marie-Sainte", + "Affiliations": ",,", + "Abstract": "Natural language processing (NLP) captured the attention of researchers for the last years. NLP is applied in various applications and several disciplines. Arabic is a language that also benefited from NLP. However, only few Arabic datasets are available for researchers. For that, applying the Arabic NLP is limited in these datasets. Hence, this paper introduces a new dataset, SNAD. SNAD is collected to fill the gap in Arabic datasets, especially for classification using deep learning. The dataset has more than 45,000 records. Each record consists of the news title, news details, in addition to the news class. The dataset has six different classes. Moreover, cleaning and preprocessing are applied to the raw data to make it more efficient for classification purpose. Finally, the dataset is validated using the Convolutional Neural Networks and the result is efficient.
The dataset is freely available online.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/sohateful.json b/datasets/sohateful.json new file mode 100644 index 0000000..975c1c2 --- /dev/null +++ b/datasets/sohateful.json @@ -0,0 +1,36 @@ +{ + "Name": "SoHateful", + "Subsets": [], + "HF Link": "nan", + "Link": "https://github.com/rafiulbiswas/hatespeech-detection", + "License": "unknown", + "Year": 2024, + "Language": "ar", + "Dialect": "mixed", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "From 70,000 crawled Arabic tweets, 15,965 tweets were selected and annotated to identify hate speech patterns and train classification models", + "Volume": "15,965", + "Unit": "sentences", + "Ethical Risks": "High", + "Provider": "HBKU", + "Derived From": "nan", + "Paper Title": "So Hateful! Building a Multi-Label Hate Speech Annotated Arabic Dataset", + "Paper Link": "https://aclanthology.org/2024.lrec-main.1308.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Upon-Request", + "Cost": "nan", + "Test Split": "No", + "Tasks": "hate speech detection", + "Venue Title": "LREC-COLING", + "Citations": "nan", + "Venue Type": "conference", + "Venue Name": "nan", + "Authors": "Wajdi Zaghouani, Hamdy Mubarak, Md. Rafiul Biswas", + "Affiliations": "nan", + "Abstract": "Social media enables widespread propagation of hate speech targeting groups based on ethnicity, religion, or other characteristics. With manual content moderation being infeasible given the volume, automatic hate speech detection is essential. This paper analyzes 70,000 Arabic tweets, from which 15,965 tweets were selected and annotated, to identify hate speech patterns and train classification models. Annotators labeled the Arabic tweets for offensive content, hate speech, emotion intensity and type, effect on readers, humor, factuality, and spam. Key findings reveal 15% of tweets contain offensive language while 6% have hate speech, mostly targeted towards groups with common ideological or political affiliations. Annotations capture diverse emotions, and sarcasm is more prevalent than humor. Additionally, 10% of tweets provide verifiable factual claims, and 7% are deemed important. For hate speech detection, deep learning models like AraBERT outperform classical machine learning approaches. By providing insights into hate speech characteristics, this work enables improved content moderation and reduced exposure to online hate. The annotated dataset advances Arabic natural language processing research and resources.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/speech-massive.json b/datasets/speech-massive.json new file mode 100644 index 0000000..e2e0e12 --- /dev/null +++ b/datasets/speech-massive.json @@ -0,0 +1,36 @@ +{ + "Name": "Speech-MASSIVE", + "Subsets": [], + "HF Link": "https://hf.co/datasets/FBK-MT/Speech-MASSIVE", + "Link": "https://hf.co/datasets/FBK-MT/Speech-MASSIVE", + "License": "CC BY-NC-SA 4.0", + "Year": 2024, + "Language": "multilingual", + "Dialect": "ar-SA: (Arabic (Saudi Arabia))", + "Domain": "transcribed audio", + "Form": "spoken", + "Collection Style": "other", + "Description": "Speech-MASSIVE is a multilingual Spoken Language Understanding (SLU) dataset comprising the speech counterpart for a portion of the MASSIVE textual corpus.
Speech-MASSIVE covers 12 languages (Arabic, German, Spanish, French, Hungarian, Korean, Dutch, Polish, European Portuguese, Russian, Turkish, and Vietnamese) from different families and inherits from MASSIVE the annotations for the intent prediction and slot-filling tasks. MASSIVE utterances' labels span 18 domains, with 60 intents and 55 slots. A full train split is provided for French and German, and for all 12 languages (including French and German) few-shot train, validation, and test splits are provided. The few-shot train split (115 examples) covers all 18 domains, 60 intents, and 55 slots (including empty slots).", + "Volume": "2,150", + "Unit": "hours", + "Ethical Risks": "Low", + "Provider": "Naver Labs", + "Derived From": "MASSIVE", + "Paper Title": "Speech-MASSIVE: A Multilingual Speech Dataset for SLU and Beyond", + "Paper Link": "https://arxiv.org/pdf/2408.03900", + "Script": "Arab", + "Tokenized": "No", + "Host": "HuggingFace", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "speech recognition", + "Venue Title": "arXiv", + "Citations": "nan", + "Venue Type": "preprint", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "We present Speech-MASSIVE, a multilingual Spoken Language Understanding (SLU) dataset comprising the speech counterpart for a portion of the MASSIVE textual corpus. Speech-MASSIVE covers 12 languages from different families and inherits from MASSIVE the annotations for the intent prediction and slot-filling tasks. Our extension is prompted by the scarcity of massively multilingual SLU datasets and the growing need for versatile speech datasets to assess foundation models (LLMs, speech encoders) across languages and tasks. We provide a multimodal, multitask, multilingual dataset and report SLU baselines using both cascaded and end-to-end architectures in various training scenarios (zero-shot, few-shot, and full fine-tune). Furthermore, we demonstrate the suitability of Speech-MASSIVE for benchmarking other tasks such as speech transcription, language identification, and speech translation. The dataset, models, and code are publicly available at: https://github.com/hlt-mt/Speech-MASSIVE", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file
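For quick inspection of entries hosted on HuggingFace, such as this one, a minimal loading sketch follows. It assumes the `datasets` library, an "ar-SA" configuration name, and a MASSIVE-style "utt" text field; all of these are assumptions to verify against the dataset card.

from datasets import load_dataset

# Minimal sketch: load the Arabic validation split of Speech-MASSIVE.
# Assumptions: config name "ar-SA", split name "validation", and a
# MASSIVE-style "utt" field -- check the dataset card for exact names.
ds = load_dataset("FBK-MT/Speech-MASSIVE", "ar-SA", split="validation")
print(ds[0]["utt"])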
diff --git a/datasets/spiral.json b/datasets/spiral.json new file mode 100644 index 0000000..c59bd11 --- /dev/null +++ b/datasets/spiral.json @@ -0,0 +1,36 @@ +{ + "Name": "SPIRAL", + "Subsets": [], + "HF Link": "nan", + "Link": "https://github.com/Dahouabdelhalim/SPIRAL", + "License": "CC BY-SA 4.0", + "Year": 2022, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "SPIRAL is a corpus dedicated to the detection and correction of spelling errors in MSA Arabic texts.", + "Volume": "248,441,892", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "Automatic Building of a Large Arabic Spelling Error Corpus", + "Paper Link": "https://link.springer.com/article/10.1007/s42979-022-01499-x", + "Script": "Arab", + "Tokenized": "No", + "Host": "Gdrive", + "Access": "Upon-Request", + "Cost": "0", + "Test Split": "Yes", + "Tasks": "machine translation, text generation, morphological analysis, normalization, spelling error detection", + "Venue Title": "SN Computer Science", + "Citations": "nan", + "Venue Type": "journal", + "Venue Name": "SN Computer Science", + "Authors": "Shaimaa Ben Aichaoui, Nawel Hiri, Abdelhalim Hafedh Dahou, and Mohamed Amine Cheragui", + "Affiliations": "Mathematics and Computer Science Department, Ahmed Draia University, Adrar, Algeria, gesis leibniz institute for the social sciences, Cologne, Germany", + "Abstract": "Today, for spelling checkers, a classical topic in natural language processing, corpora have become an important component of the development process, especially with the emergence of stochastic and machine learning approaches that exploit corpora to build resolution models. Our work consists of two phases: the first is to build a corpus dedicated to the detection and correction of spelling errors in Arabic texts, which we call SPIRAL, and the second is to assess the impact of our corpus through an experimental study using a deep learning model, AraBART. The results obtained using the F1 metric were: 80.2% for morphology errors, 81.6% for phonetic errors, 73% for physical errors, 78.3% for permutation errors, 64.3% for keyboard errors, 33.7% for delete errors, 86% for space-issue errors, and 84.5% for tachkil errors.", + "Added By": "Abdelhalim Hafedh Dahou" +} \ No newline at end of file diff --git a/datasets/stopword_lists_for_19_languages.json b/datasets/stopword_lists_for_19_languages.json new file mode 100644 index 0000000..ce0274f --- /dev/null +++ b/datasets/stopword_lists_for_19_languages.json @@ -0,0 +1,36 @@ +{ + "Name": "Stopword Lists for 19 Languages", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/stoplist_19_arabic", + "Link": "https://www.kaggle.com/datasets/rtatman/stopword-lists-for-19-languages/download", + "License": "unknown", + "Year": 2017, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "Some words, like \u201cthe\u201d or \u201cand\u201d in English, are used a lot in speech and writing. For most Natural Language Processing applications, you will want to remove these very frequent words.
This is usually done using a list of \u201cstopwords\u201d which has been compiled by hand.", + "Volume": "19", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "kaggle", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "stop words", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Afrah Altamimi" +} \ No newline at end of file
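Since the entry above describes the standard stopword-removal recipe, a minimal sketch of applying such a list may be useful; the sample Arabic stopwords below are common function words chosen for illustration, not necessarily entries from this particular list.

# Minimal sketch: filter tokens against a stopword set.
# The set below is a tiny illustrative sample; in practice, load the
# full Arabic list distributed with this dataset.
stopwords = {"في", "من", "على", "عن"}

def remove_stopwords(tokens):
    # Keep only tokens that are not stopwords.
    return [t for t in tokens if t not in stopwords]

print(remove_stopwords("ذهب الولد إلى المدرسة في الصباح".split()))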
diff --git a/datasets/student_university_corpus.json b/datasets/student_university_corpus.json new file mode 100644 index 0000000..094a181 --- /dev/null +++ b/datasets/student_university_corpus.json @@ -0,0 +1,36 @@ +{ + "Name": "Student University Corpus", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/School_Corpus", + "Link": "https://github.com/licvol/Arabic-Spoken-Language-Understanding/tree/master/UniversityStudentCorpus/MonoLingual", + "License": "unknown", + "Year": 2019, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "manual curation", + "Description": "Requests formulated by 300 students to access their information from the education office", + "Volume": "126", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "Multiple Institutions", + "Derived From": "nan", + "Paper Title": "An Arabic Multi-Domain Spoken Language Understanding System", + "Paper Link": "https://aclanthology.org/W19-7407.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "student management", + "Venue Title": "ICNLSP", + "Citations": "0.0", + "Venue Type": "conference", + "Venue Name": "International Conference on Natural Language and Speech Processing", + "Authors": "Mohamed Lichouri,Mourad Abbas,R. Djeradi,A. Djeradi", + "Affiliations": "CRSTDLA,,,", + "Abstract": "In this paper, we suggest the generalization of an Arabic Spoken Language Understanding (SLU) system in a multi-domain human-machine dialog. We are particularly interested in the domain portability of an SLU system for both structured (DBMS) and unstructured data (Information Extraction), across four domains. In this work, we used the thematic approach for four domains: School Management, Medical Diagnostics, Consultation, and Question-Answering (DAWQAS). We should note that two kinds of classifiers are used in our experiments, statistical and neural, namely Gaussian Naive Bayes, Bernoulli Naive Bayes, Logistic Regression, SGD, Passive Aggressive Classifier, Perceptron, Linear Support Vector and Convolutional Neural Network.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/sudanese_dialect_tweets_about_ridesharing_companies.json b/datasets/sudanese_dialect_tweets_about_ridesharing_companies.json new file mode 100644 index 0000000..bd1fc7c --- /dev/null +++ b/datasets/sudanese_dialect_tweets_about_ridesharing_companies.json @@ -0,0 +1,36 @@ +{ + "Name": "Sudanese Dialect tweets about ridesharing companies", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/Sudanese_Dialect_Tweet", + "Link": "https://docs.google.com/spreadsheets/d/1bNwimEQFMWtjlsKtL8PH_RNFNjg-b6p3/edit?usp=sharing&ouid=101796411348671465142&rtpof=true&sd=true", + "License": "unknown", + "Year": 2020, + "Language": "ar", + "Dialect": "ar-SD: (Arabic (Sudan))", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "Sentiment Analysis dataset collected from Twitter. It contains people's opinions on Sudanese ridesharing companies.", + "Volume": "2,116", + "Unit": "sentences", + "Ethical Risks": "High", + "Provider": "University of Khartoum", + "Derived From": "nan", + "Paper Title": "Sentiment Analysis for Sudanese Arabic Dialect Using Comparative Supervised Learning Approach", + "Paper Link": "https://ieeexplore.ieee.org/document/9429560", + "Script": "Arab", + "Tokenized": "No", + "Host": "Gdrive", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "sentiment analysis", + "Venue Title": "ICCCEEE", + "Citations": "0.0", + "Venue Type": "conference", + "Venue Name": "2020 International Conference on Computer, Control, Electrical, and Electronics Engineering", + "Authors": "Shahad Abuuznien, Zena Abdelmohsin, Ehsan Abdu, Izzeldein Amin", + "Affiliations": "University of Khartoum", + "Abstract": "Sentiment analysis encompasses the methods, techniques, and tools used to determine the polarity of a text (positive, negative, or neutral). The most popular approaches to this problem are the machine learning approach, the lexicon-based approach, and the hybrid approach. This project focuses on extracting and analyzing Sudanese social media feeds about ridesharing services. It aims to tackle the issue of Sudanese Arabic dialect analysis by conducting a comparative analysis measuring the performance of machine learning algorithms on a Sudanese dialect corpus while comparing different preprocessing approaches. For this study, a stop word list combining a Modern Standard Arabic list and a Sudanese stop word list was built and used during the analysis as one of the preprocessing steps, with four classifiers applied to a dataset consisting of 2,116 tweets. In particular, Na\u00efve Bayes (NB), Support Vector Machine (SVM), Logistic Regression, and K-Nearest Neighbor (KNN) were trained and their performance measured. The results of the selected classifiers on the dataset, after various preprocessing steps, revealed that SVM with stemming alone gives the highest F1-score (0.71) and the best accuracy (0.95).", + "Added By": "Khalid N. Elmadani" +} \ No newline at end of file
Elmadani" +} \ No newline at end of file diff --git a/datasets/sudanese_dialect_tweets_about_telecommunication_companies.json b/datasets/sudanese_dialect_tweets_about_telecommunication_companies.json new file mode 100644 index 0000000..1cbf1d8 --- /dev/null +++ b/datasets/sudanese_dialect_tweets_about_telecommunication_companies.json @@ -0,0 +1,36 @@ +{ + "Name": "Sudanese Dialect tweets about telecommunication companies", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/Sudanese_Dialect_Tweet_Tele", + "Link": "https://docs.google.com/spreadsheets/d/13fIV8oHss-QRBKN-2h5LYF1i_1O9qH1R/edit?usp=sharing&ouid=101796411348671465142&rtpof=true&sd=true", + "License": "unknown", + "Year": 2018, + "Language": "ar", + "Dialect": "ar-SD: (Arabic (Sudan))", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "Sentiment Analysis dataset written in Sudanese Arabic Dialect", + "Volume": "4,712", + "Unit": "sentences", + "Ethical Risks": "High", + "Provider": "University of Khartoum", + "Derived From": "nan", + "Paper Title": "Sentiment Analysis for Arabic Dialect Using Supervised Learning", + "Paper Link": "https://ieeexplore.ieee.org/document/8515862", + "Script": "Arab", + "Tokenized": "No", + "Host": "Gdrive", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "sentiment analysis", + "Venue Title": "ICCCEEE", + "Citations": "8.0", + "Venue Type": "conference", + "Venue Name": "International Conference on Computer, Control, Electrical, and Electronics Engineering 2018", + "Authors": "Rua Ismail, Mawada Omer, Mawada Tabir, Noor Mahadi, Izzeldein Amin", + "Affiliations": "University of Khartoum", + "Abstract": "Sentiment analysis is a set of procedures used to extract subjective opinions from the text. Generally, there are two techniques for sentiment analysis, machine learning method, and lexicon-based method. This work focuses on extracting and analyzing Twitter data written in Sudanese Arabic dialect to observe opinionated patterns regarding the quality of telecommunication services operating in Sudan. One of the significant limitations in the field of text classification is the exclusive focus on the English language. There is a need to bridge this gap by developing efficient methods and tools for sentiment analysis in the Arabic language. Moreover, reliable corpus and lexicons are needed. For this study, four classifiers were trained on a dataset consist of 4712 tweets. Namely Na\u00efve Bayes, SVM, Multinomial Logistic Regression and K-Nearest Neighbor to conduct a comparative analysis on the performance of the classifiers. These algorithms when ran against the tweets dataset the results revealed that SVM gives the highest F1-score (72.0) while the best accuracy was achieved by KNN (k=2) and it equals to 92.0.", + "Added By": "Khalid N. 
Elmadani" +} \ No newline at end of file diff --git a/datasets/sudannese_arabic_telcom_sentiment_classification_pre_processed.json b/datasets/sudannese_arabic_telcom_sentiment_classification_pre_processed.json new file mode 100644 index 0000000..ad30669 --- /dev/null +++ b/datasets/sudannese_arabic_telcom_sentiment_classification_pre_processed.json @@ -0,0 +1,36 @@ +{ + "Name": "Sudannese Arabic Telcom Sentiment Classification Pre Processed ", + "Subsets": [], + "HF Link": "nan", + "Link": "https://docs.google.com/spreadsheets/d/13fIV8oHss-QRBKN-2h5LYF1i_1O9qH1R/edit?usp=sharing&ouid=113975694262803649646&rtpof=true&sd=true", + "License": "MIT License", + "Year": 2018, + "Language": "ar", + "Dialect": "ar-SD: (Arabic (Sudan))", + "Domain": "other", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "it is pre processed dataset from Twitter about Telecom companies in Sudan, it labelled by 3 different labels from different age, gender and background", + "Volume": "5,349", + "Unit": "sentences", + "Ethical Risks": "High", + "Provider": "University of Khartoum", + "Derived From": "nan", + "Paper Title": "Sentiment analysis for arabic dialect using supervised learning", + "Paper Link": "https://ieeexplore.ieee.org/document/8515862", + "Script": "Latn", + "Tokenized": "No", + "Host": "Gdrive", + "Access": "Upon-Request", + "Cost": "nan", + "Test Split": "No", + "Tasks": "sentiment analysis", + "Venue Title": "nan", + "Citations": "14.0", + "Venue Type": "conference", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Rua Ismail" +} \ No newline at end of file diff --git a/datasets/synonyms.json b/datasets/synonyms.json new file mode 100644 index 0000000..f093853 --- /dev/null +++ b/datasets/synonyms.json @@ -0,0 +1,36 @@ +{ + "Name": "Synonyms", + "Subsets": [], + "HF Link": "nan", + "Link": "https://sina.birzeit.edu/synonyms/", + "License": "CC BY 4.0", + "Year": 2022, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "manual curation", + "Description": "A set of 500 synsets (extracted from the Arabic Wordnet). Each synset is enriched with a list of candidate synonyms. The total number is 3K candidates. ", + "Volume": "300,000", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "SinaLab, Birzeit University", + "Derived From": "nan", + "Paper Title": "A Benchmark and Scoring Algorithm for Enriching Arabic Synonyms", + "Paper Link": "https://arxiv.org/abs/2302.02232", + "Script": "Arab-Latn", + "Tokenized": "No", + "Host": "SinaLab Resources", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "language modeling, cross-lingual, information retrieval, part of speech tagging", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "Sana Ghanem, Mustafa Jarrar, Radi Jarrar, Ibrahim Bounhas", + "Affiliations": "nan", + "Abstract": "This paper addresses the task of extending a given synset with additional synonyms taking into account synonymy strength as a fuzzy value. Given a mono/multilingual synset and a threshold (a fuzzy value [0 \u2212 1]), our goal is to extract new synonyms above this threshold from existing lexicons. We present twofold contributions: an algorithm and a benchmark dataset. The dataset consists of 3K candidate synonyms for 500 synsets. Each candidate synonym is annotated with a fuzzy value by four linguists. 
The dataset is important for (i) understanding how much linguists (dis)agree on synonymy, and (ii) serving as a baseline to evaluate our algorithm. Our proposed algorithm extracts synonyms from existing lexicons and computes a fuzzy value for each candidate. Our evaluations show that the algorithm behaves like a linguist and that its fuzzy values are close to those proposed by linguists (using RMSE and MAE). The dataset and a demo page are publicly available at https://portal.sina.birzeit.edu/synonyms.", + "Added By": "Tymaa Hammouda" +} \ No newline at end of file
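The Synonyms entry evaluates its scoring algorithm by comparing algorithm-assigned fuzzy values against linguist annotations using RMSE and MAE. The arithmetic reduces to the following sketch; the score values are invented for illustration.

import math

# Hypothetical fuzzy synonymy scores in [0, 1] for five candidates.
algorithm = [0.8, 0.4, 0.9, 0.1, 0.6]
linguists = [0.7, 0.5, 1.0, 0.0, 0.6]  # e.g., averaged over four annotators

n = len(algorithm)
mae = sum(abs(a - b) for a, b in zip(algorithm, linguists)) / n
rmse = math.sqrt(sum((a - b) ** 2 for a, b in zip(algorithm, linguists)) / n)
print(f"MAE={mae:.3f}, RMSE={rmse:.3f}")  # MAE=0.080, RMSE=0.089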
diff --git a/datasets/syria_tweets_sentiment_corpus.json b/datasets/syria_tweets_sentiment_corpus.json new file mode 100644 index 0000000..20e366c --- /dev/null +++ b/datasets/syria_tweets_sentiment_corpus.json @@ -0,0 +1,36 @@ +{ + "Name": "Syria Tweets Sentiment Corpus", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/Syria_Tweet_Sentiment", + "Link": "https://saifmohammad.com/WebPages/ArabicSA.html", + "License": "unknown", + "Year": 2015, + "Language": "ar", + "Dialect": "ar-SY: (Arabic (Syria))", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "A dataset of 2,000 tweets originating from Syria", + "Volume": "2,000", + "Unit": "sentences", + "Ethical Risks": "Medium", + "Provider": "Multiple Institutions", + "Derived From": "nan", + "Paper Title": "Sentiment after Translation: A Case-Study on Arabic Social Media Posts", + "Paper Link": "https://aclanthology.org/N15-1078.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "sentiment analysis, machine translation", + "Venue Title": "NAACL", + "Citations": "125.0", + "Venue Type": "conference", + "Venue Name": "North American Chapter of the Association for Computational Linguistics", + "Authors": "Mohammad Salameh,Saif M. Mohammad,Svetlana Kiritchenko", + "Affiliations": ",National Research Council Canada,", + "Abstract": "When text is translated from one language into another, sentiment is preserved to varying degrees. In this paper, we use Arabic social media posts as a stand-in for source language text, and determine loss in sentiment predictability when they are translated into English, manually and automatically. As benchmarks, we use manually and automatically determined sentiment labels of the Arabic texts. We show that sentiment analysis of English translations of Arabic texts produces competitive results, w.r.t. Arabic sentiment analysis. We discover that even though translation significantly reduces the human ability to recover sentiment, automatic sentiment systems are still able to capture sentiment information from the translations.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/talaa.json b/datasets/talaa.json new file mode 100644 index 0000000..3225104 --- /dev/null +++ b/datasets/talaa.json @@ -0,0 +1,36 @@ +{ + "Name": "TALAA", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/TALAA", + "Link": "https://github.com/saidziani/Arabic-News-Article-Classification", + "License": "unknown", + "Year": 2015, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "crawling", + "Description": "A collection of news articles from daily Arabic newspaper websites", + "Volume": "57,827", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "USTHB Algeria", + "Derived From": "nan", + "Paper Title": "Building TALAA, a Free General and Categorized Arabic Corpus", + "Paper Link": "https://www.scitepress.org/Papers/2015/53521/53521.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "topic classification", + "Venue Title": "ICAART", + "Citations": "4.0", + "Venue Type": "conference", + "Venue Name": "Conference on Agents and Artificial Intelligence", + "Authors": "Essma Selab,A. Guessoum", + "Affiliations": ",", + "Abstract": "Arabic natural language processing (ANLP) has gained increasing interest over the last decade. However, the development of ANLP tools depends on the availability of large corpora. It turns out unfortunately that the scientific community has a deficit in large and varied Arabic corpora, especially ones that are freely accessible. With the Internet continuing its exponential growth, Arabic Internet content has also been following the trend, yielding large amounts of textual data available through different Arabic websites. This paper describes the TALAA corpus, a voluminous general Arabic corpus, built from daily Arabic newspaper websites. The corpus is a collection of more than 14 million words with 15,891,729 tokens contained in 57,827 different articles. A part of the TALAA corpus has been tagged to construct an annotated Arabic corpus of about 7000 tokens, the POS-tagger used containing a set of 58 detailed tags. The annotated corpus was manually checked by two human experts. The methodology used to construct TALAA is presented and various metrics are applied to it, showing the usefulness of the corpus. The corpus can be made available to the scientific community upon authorisation.", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/tanzil.json b/datasets/tanzil.json new file mode 100644 index 0000000..9ce44fb --- /dev/null +++ b/datasets/tanzil.json @@ -0,0 +1,36 @@ +{ + "Name": "Tanzil", + "Subsets": [], + "HF Link": "https://hf.co/datasets/Helsinki-NLP/tanzil", + "Link": "https://opus.nlpl.eu/Tanzil.php", + "License": "custom", + "Year": 2012, + "Language": "multilingual", + "Dialect": "ar-CLS: (Arabic (Classic))", + "Domain": "other", + "Form": "text", + "Collection Style": "human translation", + "Description": "This is a collection of Quran translations compiled by the Tanzil project", + "Volume": "12,472", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "OPUS", + "Derived From": "nan", + "Paper Title": "Parallel Data, Tools and Interfaces in OPUS", + "Paper Link": "http://www.lrec-conf.org/proceedings/lrec2012/pdf/463_Paper.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "machine translation", + "Venue Title": "LREC", + "Citations": "1006.0", + "Venue Type": "conference", + "Venue Name": "International Conference on Language Resources and Evaluation", + "Authors": "J. Tiedemann", + "Affiliations": "nan", + "Abstract": "This paper presents the current status of OPUS, a growing language resource of parallel corpora and related tools. The focus in OPUS is to provide freely available data sets in various formats together with basic annotation to be useful for applications in computational linguistics, translation studies and cross-linguistic corpus studies. In this paper, we report about new data sets and their features, additional annotation tools and models provided from the website and essential interfaces and on-line services included in the project.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/tapaco.json b/datasets/tapaco.json new file mode 100644 index 0000000..55d51d4 --- /dev/null +++ b/datasets/tapaco.json @@ -0,0 +1,36 @@ +{ + "Name": "TaPaCo", + "Subsets": [], + "HF Link": "https://hf.co/datasets/community-datasets/tapaco", + "Link": "https://zenodo.org/record/3707949#.YRKu0ogzaUk", + "License": "CC BY 2.0", + "Year": 2020, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "The corpus contains a total of 1.9 million sentences, with 200,000\u2013250,000 sentences per language", + "Volume": "6,446", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "University of Helsinki", + "Derived From": "Tatoeba", + "Paper Title": "TaPaCo: A Corpus of Sentential Paraphrases for 73 Languages", + "Paper Link": "https://aclanthology.org/2020.lrec-1.848.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "zenodo", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "paraphrasing", + "Venue Title": "LREC", + "Citations": "4.0", + "Venue Type": "conference", + "Venue Name": "International Conference on Language Resources and Evaluation", + "Authors": "Yves Scherrer", + "Affiliations": "nan", + "Abstract": "This paper presents TaPaCo, a freely available paraphrase corpus for 73 languages extracted from the Tatoeba database. Tatoeba is a crowdsourcing project mainly geared towards language learners. Its aim is to provide example sentences and translations for particular linguistic constructions and words. The paraphrase corpus is created by populating a graph with Tatoeba sentences and equivalence links between sentences \u201cmeaning the same thing\u201d. This graph is then traversed to extract sets of paraphrases. Several language-independent filters and pruning steps are applied to remove uninteresting sentences. A manual evaluation performed on three languages shows that between half and three quarters of inferred paraphrases are correct and that most remaining ones are either correct but trivial, or near-paraphrases that neutralize a morphological distinction. The corpus contains a total of 1.9 million sentences, with 200,000\u2013250,000 sentences per language. It covers a range of languages for which, to our knowledge, no other paraphrase dataset exists. The dataset is available at https://doi.org/10.5281/zenodo.3707949.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file
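TaPaCo's abstract describes building a graph whose nodes are Tatoeba sentences and whose edges are "means the same thing" links, then traversing it to extract paraphrase sets. That traversal amounts to finding connected components, as in this minimal sketch with made-up sentence ids and links:

from collections import defaultdict

# Hypothetical equivalence links between sentence ids.
links = [(1, 2), (2, 3), (4, 5)]
adj = defaultdict(set)
for a, b in links:
    adj[a].add(b)
    adj[b].add(a)

# Each connected component of the link graph is one paraphrase set.
seen, paraphrase_sets = set(), []
for node in adj:
    if node in seen:
        continue
    stack, component = [node], set()
    while stack:
        n = stack.pop()
        if n not in component:
            component.add(n)
            stack.extend(adj[n] - component)
    seen |= component
    paraphrase_sets.append(component)

print(paraphrase_sets)  # [{1, 2, 3}, {4, 5}]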
diff --git a/datasets/tarc.json b/datasets/tarc.json new file mode 100644 index 0000000..05cf50b --- /dev/null +++ b/datasets/tarc.json @@ -0,0 +1,36 @@ +{ + "Name": "TArC", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/TArC", + "Link": "https://github.com/eligugliotta/tarc", + "License": "unknown", + "Year": 2020, + "Language": "ar", + "Dialect": "mixed", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "A flexible and multi-purpose open corpus intended to support different types of analyses, both computational and linguistic, as well as NLP tool training", + "Volume": "4,790", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "Stanford University", + "Derived From": "nan", + "Paper Title": "TArC: Incrementally and Semi-Automatically Collecting a Tunisian Arabish Corpus", + "Paper Link": "https://aclanthology.org/2020.lrec-1.770.pdf", + "Script": "Arab-Latn", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "transliteration", + "Venue Title": "LREC", + "Citations": "2.0", + "Venue Type": "conference", + "Venue Name": "International Conference on Language Resources and Evaluation", + "Authors": "Elisa Gugliotta,Marco Dinarelli", + "Affiliations": ",", + "Abstract": "This article describes the constitution process of the first morpho-syntactically annotated Tunisian Arabish Corpus (TArC). Arabish, also known as Arabizi, is a spontaneous coding of Arabic dialects in Latin characters and \u201carithmographs\u201d (numbers used as letters). This code-system was developed by Arabic-speaking users of social media to facilitate writing in the informal settings of Computer-Mediated Communication (CMC) and text messaging. Arabish differs for each Arabic dialect, and each Arabish code-system is under-resourced, in the same way as most of the Arabic dialects. In the last few years, the attention of NLP studies on Arabic dialects has considerably increased. Taking this into consideration, TArC will be a useful support for different types of analyses, computational and linguistic, as well as for NLP tools training. In this article we describe preliminary work on the TArC semi-automatic construction process and some of the first analyses we developed on TArC.
In addition, in order to provide a complete overview of the challenges faced during the building process, we present the main Tunisian dialect characteristics and their encoding in Tunisian Arabish.", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/tashkeela.json b/datasets/tashkeela.json new file mode 100644 index 0000000..82bbaf1 --- /dev/null +++ b/datasets/tashkeela.json @@ -0,0 +1,55 @@ +{ + "Name": "Tashkeela", + "Subsets": [ + { + "Name": "Classical Arabic", + "Dialect": "ar-CLS: (Arabic (Classic))", + "Volume": "74,762,008", + "Unit": "tokens" + }, + { + "Name": "Modern Standard Arabic", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Volume": "867,913", + "Unit": "tokens" + }, + { + "Name": "Manual", + "Dialect": "mixed", + "Volume": "7,701", + "Unit": "tokens" + } + ], + "HF Link": "https://hf.co/datasets/community-datasets/tashkeela", + "Link": "https://sourceforge.net/projects/tashkeela/", + "License": "GPL-2.0", + "Year": 2017, + "Language": "ar", + "Dialect": "mixed", + "Domain": "books", + "Form": "text", + "Collection Style": "crawling", + "Description": "A corpus of Arabic vocalized (diacritized) texts", + "Volume": "75,629,921", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "ESI", + "Derived From": "nan", + "Paper Title": "Tashkeela: Novel corpus of Arabic vocalized texts, data for auto-diacritization systems", + "Paper Link": "https://www.sciencedirect.com/science/article/pii/S2352340917300112", + "Script": "Arab", + "Tokenized": "No", + "Host": "sourceforge", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "diacritization", + "Venue Title": "Data in Brief", + "Citations": "46.0", + "Venue Type": "journal", + "Venue Name": "Data in Brief", + "Authors": "Taha Zerrouki,Amar Balla", + "Affiliations": ",", + "Abstract": "Arabic diacritics are often omitted in Arabic script. This is a handicap for new learners reading Arabic, for text-to-speech conversion systems, and for reading and semantic analysis of Arabic texts. Automatic diacritization systems are the best solution to handle this issue, but such automation needs resources such as diacritized texts to train and evaluate these systems. In this paper, we describe our corpus of Arabic diacritized texts, called Tashkeela. It can be used as a linguistic resource for natural language processing tasks such as automatic diacritization systems, disambiguation mechanisms, and feature and data extraction. The corpus is freely available; it contains 75 million fully vocalized words drawn mainly from 97 books in classical and modern Arabic. The corpus is collected from manually vocalized texts using a web crawling process.", + "Added By": "nan" +} \ No newline at end of file
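A vocalized corpus like Tashkeela is commonly turned into training pairs for auto-diacritization by stripping the harakat and using the bare text as input with the original as target. A minimal sketch, assuming the standard Unicode range U+064B-U+0652 for Arabic diacritic marks:

import re

# Arabic diacritic marks (fathatan through sukun).
DIACRITICS = re.compile("[\u064B-\u0652]")

def strip_diacritics(vocalized: str) -> str:
    # Remove harakat, leaving the undiacritized text used as model input.
    return DIACRITICS.sub("", vocalized)

vocalized = "كَتَبَ"  # a fully vocalized word, as found in Tashkeela
print(strip_diacritics(vocalized), "<-", vocalized)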
diff --git a/datasets/tatoeba.json b/datasets/tatoeba.json new file mode 100644 index 0000000..c484fcf --- /dev/null +++ b/datasets/tatoeba.json @@ -0,0 +1,36 @@ +{ + "Name": "Tatoeba", + "Subsets": [], + "HF Link": "https://hf.co/datasets/Helsinki-NLP/tatoeba", + "Link": "https://tatoeba.org/en/downloads", + "License": "CC BY 2.0", + "Year": 2006, + "Language": "multilingual", + "Dialect": "mixed", + "Domain": "other", + "Form": "text", + "Collection Style": "manual curation", + "Description": "A crowd-sourced dataset of parallel sentences.", + "Volume": "62,836", + "Unit": "sentences", + "Ethical Risks": "nan", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "machine translation", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Amr Keleg" +} \ No newline at end of file diff --git a/datasets/tatoeba_translation_challenge.json b/datasets/tatoeba_translation_challenge.json new file mode 100644 index 0000000..d0b9f32 --- /dev/null +++ b/datasets/tatoeba_translation_challenge.json @@ -0,0 +1,36 @@ +{ + "Name": "Tatoeba Translation Challenge", + "Subsets": [], + "HF Link": "https://hf.co/datasets/Helsinki-NLP/tatoeba_mt", + "Link": "https://github.com/Helsinki-NLP/Tatoeba-Challenge/", + "License": "CC BY-NC-SA 4.0", + "Year": 2021, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "The Tatoeba Translation Challenge is a multilingual data set of machine translation benchmarks derived from user-contributed translations collected by Tatoeba.org and provided as a parallel corpus from OPUS. This dataset includes test and development data sorted by language pair. It includes test sets for hundreds of language pairs and is continuously updated. Please check the version number tag to refer to the release that you are using.", + "Volume": "1,064,096,596", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "Language Technology at the University of Helsinki", + "Derived From": "Tatoeba", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "machine translation", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/tdt4_multilingual_broadcast_news_speech_corpus.json b/datasets/tdt4_multilingual_broadcast_news_speech_corpus.json new file mode 100644 index 0000000..6d00b77 --- /dev/null +++ b/datasets/tdt4_multilingual_broadcast_news_speech_corpus.json @@ -0,0 +1,36 @@ +{ + "Name": "TDT4 Multilingual Broadcast News Speech Corpus", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2005S11", + "License": "LDC User Agreement for Non-Members", + "Year": 2005, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "transcribed audio", + "Form": "spoken", + "Collection Style": "other", + "Description": "The TDT4 corpus contains news data collected daily from 20 news sources (13 broadcast, seven newswire) in three languages (American English, Mandarin Chinese, and Modern Standard Arabic), over a period of four months (October 2000 through January 2001).", + "Volume": "88.3", + "Unit": "hours", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "19,000.00 $", + "Test Split": "No", + "Tasks": "speech recognition", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/tdt4_multilingual_text_and_annotations.json b/datasets/tdt4_multilingual_text_and_annotations.json new file mode 100644 index 0000000..09d8cc9 --- /dev/null +++ b/datasets/tdt4_multilingual_text_and_annotations.json @@ -0,0 +1,36 @@ +{ + "Name": "TDT4 Multilingual Text and Annotations", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2005T16", + "License": "LDC User Agreement for Non-Members", + "Year": 2005, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "other", + "Description": "The TDT4 corpus contains news data collected daily from 20 news sources in three languages over a period of four months (October 2000 through January 2001).", + "Volume": "100,000", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "2,000.00 $", + "Test Split": "No", + "Tasks": "topic detection and tracking", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", +
"Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/tdt5_multilingual_text.json b/datasets/tdt5_multilingual_text.json new file mode 100644 index 0000000..1ba62ae --- /dev/null +++ b/datasets/tdt5_multilingual_text.json @@ -0,0 +1,36 @@ +{ + "Name": "TDT5 Multilingual Text", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2006T18", + "License": "LDC User Agreement for Non-Members", + "Year": 2006, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "The TDT5 corpus spans collections from April-September 2003 of English, Chinese, and Arabic news text. A total of 15 distinct news \"sources\" are included (where a \"source\" comprises data from a given news agency in a particular language; when an agency publishes in multiple languages, each language is considered a different \"source\"). In contrast to earlier TDT corpora, TDT5 has no broadcast/audio content, only printed news from wire and web sources.", + "Volume": "407,503", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "1,500.00 $", + "Test Split": "No", + "Tasks": "topic classification, language modeling, generation", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/tdt5_topics_and_annotations.json b/datasets/tdt5_topics_and_annotations.json new file mode 100644 index 0000000..f02b834 --- /dev/null +++ b/datasets/tdt5_topics_and_annotations.json @@ -0,0 +1,36 @@ +{ + "Name": "TDT5 Topics and Annotations", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2006T19", + "License": "LDC User Agreement for Non-Members", + "Year": 2006, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "other", + "Description": "A total of 250 topics, numbered 55001 - 55250, were annotated by LDC using a search guided annotation technique. 
Details of the annotation process are described in the annotation task definition.", + "Volume": "104", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "500.00 $", + "Test Split": "No", + "Tasks": "information detection, information extraction, language modeling, machine translation, topic classification", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/tead.json b/datasets/tead.json new file mode 100644 index 0000000..87a0523 --- /dev/null +++ b/datasets/tead.json @@ -0,0 +1,36 @@ +{ + "Name": "TEAD", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/TEAD", + "Link": "https://github.com/HSMAabdellaoui/TEAD", + "License": "GPL-3.0", + "Year": 2018, + "Language": "ar", + "Dialect": "mixed", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "A dataset for Arabic sentiment analysis", + "Volume": "6,000,000", + "Unit": "sentences", + "Ethical Risks": "Medium", + "Provider": "Multiple Institutions", + "Derived From": "nan", + "Paper Title": "Using Tweets and Emojis to Build TEAD: an Arabic Dataset for Sentiment Analysis", + "Paper Link": "https://www.researchgate.net/publication/328105014_Using_Tweets_and_Emojis_to_Build_TEAD_an_Arabic_Dataset_for_Sentiment_Analysis", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Upon-Request", + "Cost": "nan", + "Test Split": "No", + "Tasks": "sentiment analysis", + "Venue Title": "Computaci\u00f3n y Sistemas", + "Citations": "17.0", + "Venue Type": "journal", + "Venue Name": "Computaci\u00f3n y Sistemas", + "Authors": "Houssem Abdellaoui,M. Zrigui", + "Affiliations": ",", + "Abstract": "Our paper presents a distant supervision algorithm for automatically collecting and labeling \u2018TEAD\u2019, a dataset for Arabic Sentiment Analysis (SA), using emojis and sentiment lexicons. The data was gathered from Twitter during the period between the 1st of June and the 30th of November 2017. Although the idea of using emojis to collect and label training data for SA is not novel, getting this approach to work for Arabic dialects was very challenging. We ended up with more than 6 million tweets labeled as Positive, Negative or Neutral. We present the algorithm used to deal with mixed-content tweets (Modern Standard Arabic MSA and Dialect Arabic DA). We also provide properties and statistics of the dataset alongside experimental results. Our experiments covered a wide range of standard classifiers proven to be efficient for the sentiment classification problem.", + "Added By": "nan" +} \ No newline at end of file
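TEAD's abstract describes a distant-supervision scheme that uses emojis as noisy sentiment labels. A minimal sketch of the idea follows; the emoji-to-polarity mapping is an illustrative assumption, not the paper's published lexicon.

# Minimal sketch of emoji-based distant supervision for sentiment labels.
POSITIVE = {"😀", "😍", "👍"}  # toy polarity lexicons; TEAD's actual ones differ
NEGATIVE = {"😡", "😢", "👎"}

def weak_label(tweet: str):
    # Assign a noisy label from emojis; return None when ambiguous.
    has_pos = any(e in tweet for e in POSITIVE)
    has_neg = any(e in tweet for e in NEGATIVE)
    if has_pos and not has_neg:
        return "positive"
    if has_neg and not has_pos:
        return "negative"
    return None  # mixed or no emoji: skip or mark neutral downstream

print(weak_label("أحببت الفيلم 😍"))  # -> positive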
diff --git a/datasets/ted_talks_corpus_(wit3).json b/datasets/ted_talks_corpus_(wit3).json new file mode 100644 index 0000000..622d5db --- /dev/null +++ b/datasets/ted_talks_corpus_(wit3).json @@ -0,0 +1,36 @@ +{ + "Name": "TED Talks Corpus (WIT3)", + "Subsets": [], + "HF Link": "nan", + "Link": "https://wit3.fbk.eu/", + "License": "CC BY-NC-ND 4.0", + "Year": 2012, + "Language": "multilingual", + "Dialect": "mixed", + "Domain": "transcribed audio", + "Form": "text", + "Collection Style": "crawling and annotation(translation)", + "Description": "The TED Talks corpus contains transcribed and translated TED talks in 82+ languages. It is curated for use in machine translation and natural language processing research, providing a valuable multilingual parallel corpus.", + "Volume": "17,000", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "FBK, Trento, Italy", + "Derived From": "TED Conference website", + "Paper Title": "WIT3: Web Inventory of Transcribed and Translated Talks", + "Paper Link": "https://www.aclweb.org/anthology/2012.eamt-1.60.pdf", + "Script": "Arab-Latn", + "Tokenized": "No", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "machine translation, speech recognition, language modeling, text generation", + "Venue Title": "EAMT", + "Citations": "nan", + "Venue Type": "conference", + "Venue Name": "European Association for Machine Translation (EAMT)", + "Authors": "Mauro Cettolo, Christian Girardi, Marcello Federico", + "Affiliations": "FBK \u2013 Fondazione Bruno Kessler, Trento, Italy", + "Abstract": "WIT3 is a web-based inventory that provides access to the TED Talks corpus. It includes transcribed and translated talks in over 80 languages, making it a valuable resource for machine translation research. The corpus has been processed to be more convenient for researchers, providing benchmarks and tools for handling multilingual parallel texts.", + "Added By": "Maryam Al Emadi" +} \ No newline at end of file diff --git a/datasets/ted_talks_iwslt.json b/datasets/ted_talks_iwslt.json new file mode 100644 index 0000000..3569b67 --- /dev/null +++ b/datasets/ted_talks_iwslt.json @@ -0,0 +1,36 @@ +{ + "Name": "TED TALKS IWSLT", + "Subsets": [], + "HF Link": "https://hf.co/datasets/IWSLT/ted_talks_iwslt", + "Link": "https://drive.google.com/u/0/uc?id=1Cz1Un9p8Xn9IpEMMrg2kXSDt0dnjxc4z&export=download", + "License": "CC BY-NC 4.0", + "Year": 2012, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "transcribed audio", + "Form": "spoken", + "Collection Style": "crawling", + "Description": "The Web Inventory of Transcribed and Translated Talks is a collection of the original TED talks and their translated versions. The translations are available in 109+ languages, though the distribution is not uniform.", + "Volume": "19,670,000", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "Fondazione Bruno Kessler", + "Derived From": "TED", + "Paper Title": "WIT3: Web Inventory of Transcribed and Translated Talks", + "Paper Link": "https://aclanthology.org/2012.eamt-1.60.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "Gdrive", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "speech recognition", + "Venue Title": "EAMT", + "Citations": "nan", + "Venue Type": "conference", + "Venue Name": "European Association for Machine Translation", + "Authors": "Mauro Cettolo, Christian Girardi, Marcello Federico", + "Affiliations": "FBK \u2013 Fondazione Bruno Kessler, Trento, Italy", + "Abstract": "We describe here a Web inventory named WIT3 that offers access to a collection of transcribed and translated talks. The core of WIT3 is the TED Talks corpus, which basically redistributes the original content published by the TED Conference website (http://www.ted.com). Since 2007, the TED Conference, based in California, has been posting all video recordings of its talks together with subtitles in English and their translations in more than 80 languages. Aside from its cultural and social relevance, this content, which is published under the Creative Commons BY-NC-ND license, also represents a precious language resource for the machine translation research community, thanks to its size, variety of topics, and covered languages. This effort repurposes the original content in a way which is more convenient for machine translation researchers.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/the_arabic_e-book_corpus.json b/datasets/the_arabic_e-book_corpus.json new file mode 100644 index 0000000..2c6967f --- /dev/null +++ b/datasets/the_arabic_e-book_corpus.json @@ -0,0 +1,36 @@ +{ + "Name": "The Arabic E-Book Corpus", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/arabic_ebook_corpus", + "Link": "https://snd.se/en/catalogue/dataset/preview/eed46fe0-dfeb-442b-8a71-74d952e006c2/1#", + "License": "CC BY 4.0", + "Year": 2024, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "books", + "Form": "text", + "Collection Style": "crawling", + "Description": "The Arabic E-Book Corpus is a freely available collection of 1,745 books (81.5 million words) published by the Hindawi Foundation between 2008 and 2024. The books are of various genres, including non-fiction, novels, children's literature, poetry, and plays. The corpus is provided in two versions: HTML and unformatted plain text.
The latter version will be appropriate for most purposes.", + "Volume": "81,500,000", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "language modeling, topic classification, language identification, text classification", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/the_arabic_speech_corpus_for_isolated_words.json b/datasets/the_arabic_speech_corpus_for_isolated_words.json new file mode 100644 index 0000000..e79216a --- /dev/null +++ b/datasets/the_arabic_speech_corpus_for_isolated_words.json @@ -0,0 +1,36 @@ +{ + "Name": "The Arabic Speech Corpus for Isolated Words", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/Speech_Corpus_for_Isolated_Words", + "Link": "https://www.cs.stir.ac.uk/~lss/arabic/", + "License": "unknown", + "Year": 2014, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "transcribed audio", + "Form": "spoken", + "Collection Style": "manual curation", + "Description": "The Arabic speech corpus for isolated words contains 9,992 utterances of 20 words spoken by 50 native male Arabic speakers. It has been recorded with a 44,100 Hz sampling rate and 16-bit resolution. This corpus is free for noncommercial use in the raw format (.wav files); other formats (e.g., MFCCs) are available upon request.", + "Volume": "9,992", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "University of Stirling", + "Derived From": "nan", + "Paper Title": "On Improving the Classification Capability of Reservoir Computing for Arabic Speech Recognition", + "Paper Link": "https://www.cs.stir.ac.uk/~lss/recentpapers/icann2014AlalshekmubarakSmith.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "speech recognition", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "preprint", + "Venue Name": "nan", + "Authors": "Abdulrahman Alalshekmubarak, Leslie S. Smith", + "Affiliations": "University of Stirling", + "Abstract": "Designing noise-resilient systems is a major challenge in the field of automated speech recognition (ASR). These systems are crucial for real-world applications where high levels of noise tend to be present. We introduce a noise-robust system based on Echo State Networks and Extreme Kernel machines which we call ESNEKM. To evaluate the performance of the proposed system, we used our recently released public Arabic speech dataset and the well-known spoken Arabic digits (SAD) dataset. Different feature extraction methods considered in this study include mel-frequency cepstral coefficients (MFCCs), perceptual linear prediction (PLP) and RASTA perceptual linear prediction. These extracted features were fed to the ESNEKM and the results compared with a baseline hidden Markov model (HMM), so that nine models were compared in total. ESNEKM models outperformed HMM models under all the feature extraction methods, noise levels, and noise types. The best performance was obtained by the model that combined RASTA-PLP with ESNEKM.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/the_nine_books_of_arabic_hadith.json b/datasets/the_nine_books_of_arabic_hadith.json new file mode 100644 index 0000000..8e238c4 --- /dev/null +++ b/datasets/the_nine_books_of_arabic_hadith.json @@ -0,0 +1,36 @@ +{ + "Name": "The Nine Books Of Arabic Hadith", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/Hadith", + "Link": "https://github.com/abdelrahmaan/Hadith-Data-Sets", + "License": "unknown", + "Year": 2020, + "Language": "ar", + "Dialect": "ar-CLS: (Arabic (Classic))", + "Domain": "other", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "There are two files of Hadith: the first contains all 62,169 hadiths from the Nine Books, with and without tashkeel; the second contains a preprocessed version, to which normalization, stop-word removal, and lemmatization have been applied.", + "Volume": "62,169", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "text classification, text similarity", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Abdulrahman Kamar" +} \ No newline at end of file diff --git a/datasets/the_sadid_evaluation_datasets.json b/datasets/the_sadid_evaluation_datasets.json new file mode 100644 index 0000000..a88bffa --- /dev/null +++ b/datasets/the_sadid_evaluation_datasets.json @@ -0,0 +1,61 @@ +{ + "Name": "The SADID Evaluation Datasets", + "Subsets": [ + { + "Name": "Levantine", + "Dialect": "ar-LEV: (Arabic (Levant))", + "Volume": "8,988", + "Unit": "sentences" + }, + { + "Name": "Egyptian", + "Dialect": "ar-EG: (Arabic (Egypt))", + "Volume": "8,988", + "Unit": "sentences" + }, + { + "Name": "MSA", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Volume": "2,994", + "Unit": "sentences" + }, + { + "Name": "English", + "Dialect": "mixed", + "Volume": "8,994", + "Unit": "sentences" + } + ], + "HF Link": "https://hf.co/datasets/arbml/SADID", + "Link": "https://github.com/we7el/SADID", + "License": "unknown", + "Year": 2020, + "Language": "ar", + "Dialect": "mixed", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "Evaluation datasets for low-resource spoken language machine translation of Arabic dialects", + "Volume": "29,964", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "Stanford University", + "Derived From": "Contains curated data and data from the following corpora (LDC2012T09, LDC2019T01, LDC2019T18, LDC2020T05, LDC2012T09)", + "Paper Title": "The SADID Evaluation Datasets for Low-Resource Spoken Language Machine Translation of Arabic Dialects", + "Paper Link": "https://aclanthology.org/2020.coling-main.530.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "machine translation", + "Venue Title": "COLING", + "Citations": "0.0", + "Venue Type": "conference", + "Venue Name": "International Conference on Computational Linguistics", + "Authors": "Wael Abid", + "Affiliations": "nan", + "Abstract": "Low-resource Machine
Translation recently gained a lot of popularity, and for certain languages, it has made great strides. However, it is still difficult to track progress in other languages for which there is no publicly available evaluation data. In this paper, we introduce benchmark datasets for Arabic and its dialects. We describe our design process and motivations and analyze the datasets to understand their resulting properties. Numerous successful attempts use large monolingual corpora to augment low-resource pairs. We try to approach augmentation differently and investigate whether it is possible to improve MT models without any external sources of data. We accomplish this by bootstrapping existing parallel sentences and complement this with multilingual training to achieve strong baselines.", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/tides_extraction_(ace)_2003_multilingual_training_data.json b/datasets/tides_extraction_(ace)_2003_multilingual_training_data.json new file mode 100644 index 0000000..7ef5226 --- /dev/null +++ b/datasets/tides_extraction_(ace)_2003_multilingual_training_data.json @@ -0,0 +1,36 @@ +{ + "Name": "TIDES Extraction (ACE) 2003 Multilingual Training Data", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2004T09", + "License": "LDC User Agreement for Non-Members", + "Year": 2004, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "transcribed audio", + "Form": "text", + "Collection Style": "other", + "Description": "Annotations for this corpus were produced by the Linguistic Data Consortium to support information extraction tasks in each of the included languages.", + "Volume": "42,197", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "3,000.00 $", + "Test Split": "No", + "Tasks": "information retrieval, information detection", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/toxi-text-3m.json b/datasets/toxi-text-3m.json new file mode 100644 index 0000000..b1e0877 --- /dev/null +++ b/datasets/toxi-text-3m.json @@ -0,0 +1,36 @@ +{ + "Name": "toxi-text-3M", + "Subsets": [], + "HF Link": "https://hf.co/datasets/FredZhang7/toxi-text-3M", + "Link": "https://hf.co/datasets/FredZhang7/toxi-text-3M", + "License": "Apache-2.0", + "Year": 2023, + "Language": "multilingual", + "Dialect": "mixed", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "This is a large multilingual toxicity dataset with 3M rows of text data from 55 natural languages, all of which are written/sent by humans, not machine translation models.", + "Volume": "51,852", + "Unit": "sentences", + "Ethical Risks": "High", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "HuggingFace", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "toxicity detection", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git
a/datasets/trad_arabic-english_mailing_lists_parallel_corpus_-_development_set.json b/datasets/trad_arabic-english_mailing_lists_parallel_corpus_-_development_set.json new file mode 100644 index 0000000..37a5743 --- /dev/null +++ b/datasets/trad_arabic-english_mailing_lists_parallel_corpus_-_development_set.json @@ -0,0 +1,36 @@ +{ + "Name": "TRAD Arabic-English Mailing lists Parallel corpus - Development set", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalogue.elra.info/en-us/repository/browse/ELRA-W0108/", + "License": "Non Commercial Use - ELRA END USER", + "Year": 2016, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "wikipedia", + "Form": "text", + "Collection Style": "crawling and annotation(translation)", + "Description": "This is a parallel corpus of 10,000 words in Arabic and a reference translation in English. The source texts are emails collected from Wikiar-I, a mailing list for discussions about the Arabic Wikipedia. The collected emails are dated from 2004 to 2007. The translation has been conducted following a strict protocol aimed at producing high quality translations.", + "Volume": "10,000", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "ELRA", + "Access": "With-Fee", + "Cost": "nan", + "Test Split": "No", + "Tasks": "machine translation ", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/trad_arabic-english_mailing_lists_parallel_corpus_-_test_set.json b/datasets/trad_arabic-english_mailing_lists_parallel_corpus_-_test_set.json new file mode 100644 index 0000000..2cfecd7 --- /dev/null +++ b/datasets/trad_arabic-english_mailing_lists_parallel_corpus_-_test_set.json @@ -0,0 +1,36 @@ +{ + "Name": "TRAD Arabic-English Mailing lists Parallel corpus - Test set ", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalogue.elra.info/en-us/repository/browse/ELRA-W0106/", + "License": "Non Commercial Use - ELRA END USER", + "Year": 2016, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "wikipedia", + "Form": "text", + "Collection Style": "crawling and annotation(translation)", + "Description": "This is a parallel corpus of 10,000 words in Arabic and 2 reference translations in English. The source texts are emails collected from Wikiar-I, a mailing list for discussions about the Arabic Wikipedia. The collected emails are dated from 2010 to 2012. 
The translation has been conducted by two different translation teams following a strict protocol aimed at producing high quality translations.", + "Volume": "10,000", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "ELRA", + "Access": "With-Fee", + "Cost": "300.00\u20ac", + "Test Split": "No", + "Tasks": "machine translation", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/trad_arabic-english_newspaper_parallel_corpus_-_test_set_1.json b/datasets/trad_arabic-english_newspaper_parallel_corpus_-_test_set_1.json new file mode 100644 index 0000000..a8d4086 --- /dev/null +++ b/datasets/trad_arabic-english_newspaper_parallel_corpus_-_test_set_1.json @@ -0,0 +1,36 @@ +{ + "Name": "TRAD Arabic-English Newspaper Parallel corpus - Test set 1", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalogue.elra.info/en-us/repository/browse/ELRA-W0099/", + "License": "Non Commercial Use - ELRA END USER", + "Year": 2016, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "crawling and annotation(translation)", + "Description": "This is a parallel corpus of 10,000 words in Arabic and 2 reference translations in English. The source texts are articles collected in 2012 from the Arabic version of Le Monde Diplomatique. The translation has been conducted by two different translation teams following a strict protocol aimed at producing high quality translations.", + "Volume": "10,000", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "ELRA", + "Access": "With-Fee", + "Cost": "300.00\u20ac", + "Test Split": "No", + "Tasks": "machine translation ", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/trad_arabic-english_parallel_corpus_of_transcribed_broadcast_news_speech.json b/datasets/trad_arabic-english_parallel_corpus_of_transcribed_broadcast_news_speech.json new file mode 100644 index 0000000..363e197 --- /dev/null +++ b/datasets/trad_arabic-english_parallel_corpus_of_transcribed_broadcast_news_speech.json @@ -0,0 +1,36 @@ +{ + "Name": "TRAD Arabic-English Parallel corpus of transcribed Broadcast News Speech", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalogue.elra.info/en-us/repository/browse/ELRA-W0102/", + "License": "Non Commercial Use - ELRA END USER", + "Year": 2016, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "transcribed audio", + "Form": "text", + "Collection Style": "other", + "Description": "This is a parallel corpus of 10,000 words in Arabic and 2 reference translations in English. The source texts are transcriptions of broadcast news in Arabic recorded on France 24. 
The translation has been conducted by two different translation teams following a strict protocol aimed at producing high quality translations.", + "Volume": "10,000", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "ELRA", + "Access": "With-Fee", + "Cost": "300.00\u20ac", + "Test Split": "No", + "Tasks": "machine translation, speech recognition ", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/trad_arabic-english_web_domain_(blogs)_parallel_corpus.json b/datasets/trad_arabic-english_web_domain_(blogs)_parallel_corpus.json new file mode 100644 index 0000000..6a79bed --- /dev/null +++ b/datasets/trad_arabic-english_web_domain_(blogs)_parallel_corpus.json @@ -0,0 +1,36 @@ +{ + "Name": "TRAD Arabic-English Web domain (blogs) Parallel corpus", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalogue.elra.info/en-us/repository/browse/ELRA-W0104/", + "License": "Non Commercial Use - ELRA END USER", + "Year": 2016, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "This is a parallel corpus of 10,000 words in Arabic and 2 reference translations in English. The source texts are blog articles written between 2008 and 2013. The translation has been conducted by two different translation teams following a strict protocol aimed at producing high quality translations.", + "Volume": "10,000", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "ELRA", + "Access": "With-Fee", + "Cost": "300.00\u20ac", + "Test Split": "No", + "Tasks": "machine translation ", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/trad_arabic-french_mailing_lists_parallel_corpus_-_development_set.json b/datasets/trad_arabic-french_mailing_lists_parallel_corpus_-_development_set.json new file mode 100644 index 0000000..cd17c80 --- /dev/null +++ b/datasets/trad_arabic-french_mailing_lists_parallel_corpus_-_development_set.json @@ -0,0 +1,36 @@ +{ + "Name": "TRAD Arabic-French Mailing lists Parallel corpus - Development set", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalogue.elra.info/en-us/repository/browse/ELRA-W0107/", + "License": "Non Commercial Use - ELRA END USER", + "Year": 2016, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "crawling", + "Description": "This is a parallel corpus of 10,000 words in Arabic and a reference translation in French. 
The source texts are emails collected from Wikiar-I, a mailing list for discussions about the Arabic Wikipedia.", + "Volume": "10,000", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "ELRA", + "Access": "With-Fee", + "Cost": "300.00\u20ac", + "Test Split": "No", + "Tasks": "machine translation ", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/trad_arabic-french_mailing_lists_parallel_corpus_-_test_set.json b/datasets/trad_arabic-french_mailing_lists_parallel_corpus_-_test_set.json new file mode 100644 index 0000000..473bdd5 --- /dev/null +++ b/datasets/trad_arabic-french_mailing_lists_parallel_corpus_-_test_set.json @@ -0,0 +1,36 @@ +{ + "Name": "TRAD Arabic-French Mailing lists Parallel corpus - Test set ", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalogue.elra.info/en-us/repository/browse/ELRA-W0105/", + "License": "Non Commercial Use - ELRA END USER", + "Year": 2016, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "crawling", + "Description": "This is a parallel corpus of 10,000 words in Arabic and 4 reference translations in French. The source texts are emails collected from Wikiar-I, a mailing list for discussions about the Arabic Wikipedia.", + "Volume": "10,000", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "ELRA", + "Access": "With-Fee", + "Cost": "300.00\u20ac", + "Test Split": "No", + "Tasks": "machine translation ", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/trad_arabic-french_parallel_text_--_newsgroup.json b/datasets/trad_arabic-french_parallel_text_--_newsgroup.json new file mode 100644 index 0000000..90e5bcc --- /dev/null +++ b/datasets/trad_arabic-french_parallel_text_--_newsgroup.json @@ -0,0 +1,36 @@ +{ + "Name": "TRAD Arabic-French Parallel Text -- Newsgroup", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2018T13", + "License": "LDC User Agreement for Non-Members", + "Year": 2018, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "This release consists of 398 segments (translation units) from 17 documents. The source data is Arabic newsgroup text collected and translated into English by the Linguistic Data Consortium for the DARPA GALE (Global Autonomous Language Exploitation) program. 
Information about the ELDA translation team, translation guidelines and validation results is contained in the documentation accompanying this release.", + "Volume": "398", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "300.00 $", + "Test Split": "No", + "Tasks": "language modeling,machine translation", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/trad_arabic-french_parallel_text_--_newswire.json b/datasets/trad_arabic-french_parallel_text_--_newswire.json new file mode 100644 index 0000000..4f6426f --- /dev/null +++ b/datasets/trad_arabic-french_parallel_text_--_newswire.json @@ -0,0 +1,36 @@ +{ + "Name": "TRAD Arabic-French Parallel Text -- Newswire", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2018T21", + "License": "LDC User Agreement for Non-Members", + "Year": 2018, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "other", + "Description": "This release consists of 813 segments (translation units) from 74 documents. The source data is Arabic newswire text collected and translated into English by LDC. Information about the ELDA translation team, translation guidelines and validation results is contained in the documentation accompanying this release.", + "Volume": "813", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "nan", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "350.00 $", + "Test Split": "No", + "Tasks": "language modeling,machine translation", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/transliteration.json b/datasets/transliteration.json new file mode 100644 index 0000000..7b221a3 --- /dev/null +++ b/datasets/transliteration.json @@ -0,0 +1,36 @@ +{ + "Name": "Transliteration", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/google_transliteration", + "Link": "https://github.com/google/transliteration", + "License": "Apache-2.0", + "Year": 2016, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "wikipedia", + "Form": "text", + "Collection Style": "crawling", + "Description": "Arabic-English transliteration dataset mined from Wikipedia.", + "Volume": "15,898", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "Google", + "Derived From": "nan", + "Paper Title": "Sequence-to-sequence neural network models for transliteration", + "Paper Link": "https://arxiv.org/pdf/1610.09565.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "transliteration", + "Venue Title": "arXiv", + "Citations": "nan", + "Venue Type": "preprint", + "Venue Name": "nan", + "Authors": "Mihaela Rosca, Thomas Breuel", + "Affiliations": "Google", + "Abstract": "Transliteration is a key component of
machine\ntranslation systems and software internationalization. This paper demonstrates that neural\nsequence-to-sequence models obtain state of\nthe art or close to state of the art results on existing datasets. In an effort to make machine\ntransliteration accessible, we open source a\nnew Arabic to English transliteration dataset\nand our trained models.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/trecvid_2005_keyframes_&_transcripts.json b/datasets/trecvid_2005_keyframes_&_transcripts.json new file mode 100644 index 0000000..41ee713 --- /dev/null +++ b/datasets/trecvid_2005_keyframes_&_transcripts.json @@ -0,0 +1,36 @@ +{ + "Name": "TRECVID 2005 Keyframes & Transcripts", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2007V01", + "License": "LDC User Agreement for Non-Members", + "Year": 2007, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "other", + "Collection Style": "other", + "Description": "The source data is Arabic, Chinese and English language broadcast programming collected in November 2004 from the following sources: Lebanese Broadcasting Corp. (Arabic); China Central TV and New Tang Dynasty TV (Chinese); and CNN and MSNBC/NBC (English).", + "Volume": "nan", + "Unit": "nan", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "500.00 $", + "Test Split": "No", + "Tasks": "information retrieval,information extraction,event detection", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/trecvid_2006_keyframes.json b/datasets/trecvid_2006_keyframes.json new file mode 100644 index 0000000..9afd30d --- /dev/null +++ b/datasets/trecvid_2006_keyframes.json @@ -0,0 +1,36 @@ +{ + "Name": "TRECVID 2006 Keyframes", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2010V02", + "License": "LDC User Agreement for Non-Members", + "Year": 2010, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "other", + "Collection Style": "other", + "Description": "The video stills that compose this corpus are drawn from approximately 158.6 hours of English, Arabic, and Chinese language broadcast programming data collected by LDC from NBC (\"NBC Nightly News\"), CNN (\"Live From..\", \"Anderson Cooper 360\"), MSNBC (\"MSNBC News live\"), New Tang Dynasty TV (\"Economic Frontier\", \"Focus Interactive\"), Phoenix TV (\"Good Morning China\"), Lebanese Broadcasting Corp. (\"Naharkum Saiid\", \"News on LBC\"), Alhurra TV (\"Alhurra News\") and China Central TV (\"CCTV_News\").
", + "Volume": "158.6", + "Unit": "hours", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "500.00 $", + "Test Split": "No", + "Tasks": "information extraction,event detection,information retrieval", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/troll_detection.json b/datasets/troll_detection.json new file mode 100644 index 0000000..7459625 --- /dev/null +++ b/datasets/troll_detection.json @@ -0,0 +1,36 @@ +{ + "Name": "Troll Detection", + "Subsets": [], + "HF Link": "nan", + "Link": "https://www.dropbox.com/s/hqab7kp2zyex01h/Trolls%20Dataset.zip?dl=0", + "License": "unknown", + "Year": 2020, + "Language": "ar", + "Dialect": "mixed", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "Trolls detection in Tweets", + "Volume": "128", + "Unit": "sentences", + "Ethical Risks": "Medium", + "Provider": "-", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "Dropbox", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "trolls detection", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/tsac.json b/datasets/tsac.json new file mode 100644 index 0000000..ce0c4ba --- /dev/null +++ b/datasets/tsac.json @@ -0,0 +1,36 @@ +{ + "Name": "TSAC", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/TSAC", + "Link": "https://github.com/fbougares/TSAC", + "License": "LGPL-3.0", + "Year": 2017, + "Language": "ar", + "Dialect": "ar-TN: (Arabic (Tunisia))", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "About 17k user comments manually annotated to positive and negative polarities. This corpus is collected from Facebook users comments written on official pages of Tunisian radios and TV channels", + "Volume": "17,000", + "Unit": "sentences", + "Ethical Risks": "Medium", + "Provider": "Vienna", + "Derived From": "nan", + "Paper Title": "Sentiment Analysis of Tunisian Dialect:\nLinguistic Resources and Experiments", + "Paper Link": "https://aclanthology.org/W17-1307.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "sentiment analysis", + "Venue Title": "WANLP", + "Citations": "59.0", + "Venue Type": "workshop", + "Venue Name": "Arabic Natural Language Processing Workshop", + "Authors": "Salima Medhaffar,Fethi Bougares,Y. Est\u00e8ve,L. Belguith", + "Affiliations": ",,,", + "Abstract": "Dialectal Arabic (DA) is significantly different from the Arabic language taught in schools and used in written communication and formal speech (broadcast news, religion, politics, etc.). There are many existing researches in the field of Arabic language Sentiment Analysis (SA); however, they are generally restricted to Modern Standard Arabic (MSA) or some dialects of economic or political interest. In this paper we are interested in the SA of the Tunisian Dialect. 
We utilize Machine Learning techniques to determine the polarity of comments written in Tunisian Dialect. First, we evaluate the SA systems performances with models trained using freely available MSA and Multi-dialectal data sets. We then collect and annotate a Tunisian Dialect corpus of 17.000 comments from Facebook. This corpus allows us a significant accuracy improvement compared to the best model trained on other Arabic dialects or MSA data. We believe that this first freely available corpus will be valuable to researchers working in the field of Tunisian Sentiment Analysis and similar areas.", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/tudicoi.json b/datasets/tudicoi.json new file mode 100644 index 0000000..13e1e5b --- /dev/null +++ b/datasets/tudicoi.json @@ -0,0 +1,36 @@ +{ + "Name": "TuDiCoI", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/TuDiCoI", + "Link": "https://sites.google.com/site/anlprg/outils-et-corpus-realisess", + "License": "unknown", + "Year": 2010, + "Language": "ar", + "Dialect": "ar-TN: (Arabic (Tunisia))", + "Domain": "transcribed audio", + "Form": "spoken", + "Collection Style": "manual curation", + "Description": "The corpus consists of 434 dialogues comprising 1,465 staff utterances and 1,615 client utterances", + "Volume": "127", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "University of Sfax", + "Derived From": "nan", + "Paper Title": "LEXICAL STUDY OF A SPOKEN DIALOGUE CORPUS IN TUNISIAN DIALECT", + "Paper Link": "https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.452.7847&rep=rep1&type=pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "speech recognition", + "Venue Title": "other", + "Citations": "15.0", + "Venue Type": "preprint", + "Venue Name": "nan", + "Authors": "M. Graja,M. Jaoua,L. Belguith", + "Affiliations": ",,", + "Abstract": "The aim of this paper is to present a lexical study of a spoken dialogue corpus in Tunisian dialect since such resources do not currently exist. The lexical analysis permits us to take into account the specificity of Tunisian dialect by identifying lexical varieties and significant elements used in the spoken dialogue.
This can lead us to provide a useful characterization for dialogue systems and help us to develop models and methods specifically designed for Tunisian dialect.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/tufs_media.json b/datasets/tufs_media.json new file mode 100644 index 0000000..33be062 --- /dev/null +++ b/datasets/tufs_media.json @@ -0,0 +1,36 @@ +{ + "Name": "TUFS Media", + "Subsets": [], + "HF Link": "nan", + "Link": "http://ngc2068.tufs.ac.jp/tufsmedia-corpus/", + "License": "unknown", + "Year": 2018, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "crawling and annotation(translation)", + "Description": "a parallel corpus of translated news articles collected at Tokyo University of Foreign Studies (TUFS)", + "Volume": "8,652", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "Tokyo University of Foreign Studies", + "Derived From": "nan", + "Paper Title": "A Parallel Corpus of Arabic\u2013Japanese News Articles", + "Paper Link": "https://aclanthology.org/L18-1147.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Upon-Request", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "machine translation", + "Venue Title": "LREC", + "Citations": "9.0", + "Venue Type": "conference", + "Venue Name": "International Conference on Language Resources and Evaluation", + "Authors": "Go Inoue,Nizar Habash,Yuji Matsumoto,Hiroyuki Aoyama", + "Affiliations": "New York University;New York University Abu Dhabi,,,", + "Abstract": "Much work has been done on machine translation between major language pairs including Arabic\u2013English and English\u2013Japanese thanks to the availability of large-scale parallel corpora with manually verified subsets of parallel sentences. However, there has been little research conducted on the Arabic\u2013Japanese language pair due to its parallel-data scarcity, despite being a good example of interestingly contrasting differences in typology. In this paper, we describe the creation process and statistics of the Arabic\u2013Japanese portion of the TUFS Media Corpus, a parallel corpus of translated news articles collected at Tokyo University of Foreign Studies (TUFS). Part of the corpus is manually aligned at the sentence level for development and testing. The corpus is provided in two formats: A document-level parallel corpus in XML format, and a sentence-level parallel corpus in plain text format. We also report the first results of Arabic\u2013Japanese phrase-based machine translation trained on our corpus.", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/tunisian_arabic_corpus.json b/datasets/tunisian_arabic_corpus.json new file mode 100644 index 0000000..7734d8d --- /dev/null +++ b/datasets/tunisian_arabic_corpus.json @@ -0,0 +1,36 @@ +{ + "Name": "Tunisian Arabic Corpus", + "Subsets": [], + "HF Link": "nan", + "Link": "http://www.tunisiya.org/", + "License": "unknown", + "Year": 2010, + "Language": "ar", + "Dialect": "ar-TN: (Arabic (Tunisia))", + "Domain": "web pages", + "Form": "text", + "Collection Style": "crawling", + "Description": "There are currently 2,874 texts in the corpus, comprising 1,088,614 words.
Internet sources are currently the dominant category.", + "Volume": "2,874", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "Tunisian Arabic Corpus: Creating a written corpus of an \"unwritten\" language", + "Paper Link": "https://www.academia.edu/28966672/Tunisian_Arabic_Corpus_Creating_a_written_corpus_of_an_unwritten_language", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "morphological analysis", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "preprint", + "Venue Name": "nan", + "Authors": "Karen McNeil", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/tunizi.json b/datasets/tunizi.json new file mode 100644 index 0000000..af43f7a --- /dev/null +++ b/datasets/tunizi.json @@ -0,0 +1,36 @@ +{ + "Name": "TUNIZI", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/TUNIZI", + "Link": "https://github.com/chaymafourati/TUNIZI-Sentiment-Analysis-Tunisian-Arabizi-Dataset", + "License": "unknown", + "Year": 2020, + "Language": "ar", + "Dialect": "mixed", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "The first Tunisian Arabizi dataset, including 3K sentences, balanced, covering different topics, preprocessed and annotated as positive and negative", + "Volume": "3,000", + "Unit": "sentences", + "Ethical Risks": "Medium", + "Provider": "iCompass", + "Derived From": "nan", + "Paper Title": "TUNIZI: A TUNISIAN ARABIZI SENTIMENT ANALYSIS DATASET", + "Paper Link": "https://arxiv.org/pdf/2004.14303.pdf", + "Script": "Arab-Latn", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "sentiment analysis", + "Venue Title": "ArXiv", + "Citations": "8.0", + "Venue Type": "preprint", + "Venue Name": "ArXiv", + "Authors": "Chayma Fourati,Abir Messaoudi,Hatem Haddad", + "Affiliations": ",,iCompass", + "Abstract": "On social media, Arabic people tend to express themselves in their own local dialects. More particularly, Tunisians use the informal way called \"Tunisian Arabizi\". Analytical studies seek to explore and recognize online opinions aiming to exploit them for planning and prediction purposes such as measuring the customer satisfaction and establishing sales and marketing strategies. However, analytical studies based on Deep Learning are data hungry. On the other hand, African languages and dialects are considered low resource languages. For instance, to the best of our knowledge, no annotated Tunisian Arabizi dataset exists.
In this paper, we introduce TUNIZI, a sentiment analysis Tunisian Arabizi dataset collected from social networks, preprocessed for analytical studies and annotated manually by Tunisian native speakers.", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/twifil.json b/datasets/twifil.json new file mode 100644 index 0000000..2d45300 --- /dev/null +++ b/datasets/twifil.json @@ -0,0 +1,36 @@ +{ + "Name": "Twifil", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/Twifil", + "Link": "https://github.com/kinmokusu/oea_algd", + "License": "unknown", + "Year": 2020, + "Language": "ar", + "Dialect": "ar-DZ: (Arabic (Algeria))", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "An Algerian dialect dataset annotated for sentiment (9,000 tweets), emotion (about 5,000 tweets), and extra-linguistic information including author profiling (age and gender)", + "Volume": "14,000", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "An Algerian Corpus and an Annotation Platform for Opinion and Emotion Analysis", + "Paper Link": "https://aclanthology.org/2020.lrec-1.151.pdf", + "Script": "Arab-Latn", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "sentiment analysis, emotion detection", + "Venue Title": "LREC", + "Citations": "14.0", + "Venue Type": "conference", + "Venue Name": "Language Resources and Evaluation Conference", + "Authors": "Leila Moudjari, Karima Akli-Astouati, Farah Benamara", + "Affiliations": "nan", + "Abstract": "In this paper, we address the lack of resources for opinion and emotion analysis related to North African dialects, targeting Algerian dialect. We present TWIFIL (TWItter proFILing), a collaborative annotation platform for crowdsourcing annotation of tweets at different levels of granularity. The platform allowed the creation of the largest Algerian dialect dataset annotated for sentiment (9,000 tweets), emotion (about 5,000 tweets), and extra-linguistic information including author profiling (age and gender). The annotation also resulted in the creation of the largest Algerian dialect subjectivity lexicon, of about 9,000 entries, which can constitute a valuable resource for the development of future NLP applications for Algerian dialect. To test the validity of the dataset, a set of deep learning experiments were conducted to classify a given tweet as positive, negative or neutral.
We discuss our results and provide an error analysis to better identify classification errors.", + "Added By": "Abderrahmane Issam" +} \ No newline at end of file diff --git a/datasets/twt15da_lists.json b/datasets/twt15da_lists.json new file mode 100644 index 0000000..7733677 --- /dev/null +++ b/datasets/twt15da_lists.json @@ -0,0 +1,127 @@ +{ + "Name": "Twt15DA_Lists", + "Subsets": [ + { + "Name": "Yemeni", + "Dialect": "ar-YE: (Arabic (Yemen))", + "Volume": "20,004", + "Unit": "sentences" + }, + { + "Name": "Omani", + "Dialect": "ar-OM: (Arabic (Oman))", + "Volume": "20,861", + "Unit": "sentences" + }, + { + "Name": "Saudi", + "Dialect": "ar-SA: (Arabic (Saudi Arabia))", + "Volume": "21,110", + "Unit": "sentences" + }, + { + "Name": "Emirati", + "Dialect": "ar-AE: (Arabic (United Arab Emirates))", + "Volume": "20,957", + "Unit": "sentences" + }, + { + "Name": "Qatari", + "Dialect": "ar-QA: (Arabic (Qatar))", + "Volume": "22,160", + "Unit": "sentences" + }, + { + "Name": "Bahraini", + "Dialect": "ar-BH: (Arabic (Bahrain))", + "Volume": "22,160", + "Unit": "sentences" + }, + { + "Name": "Kuwaiti", + "Dialect": "ar-KW: (Arabic (Kuwait))", + "Volume": "20,338", + "Unit": "sentences" + }, + { + "Name": "Iraqi", + "Dialect": "ar-IQ: (Arabic (Iraq))", + "Volume": "20,241", + "Unit": "sentences" + }, + { + "Name": "Jordanian", + "Dialect": "ar-JO: (Arabic (Jordan))", + "Volume": "19,762", + "Unit": "sentences" + }, + { + "Name": "Syrian", + "Dialect": "ar-SY: (Arabic (Syria))", + "Volume": "18,750", + "Unit": "sentences" + }, + { + "Name": "Egyptian", + "Dialect": "ar-EG: (Arabic (Egypt))", + "Volume": "20,109", + "Unit": "sentences" + }, + { + "Name": "Libyan", + "Dialect": "ar-LY: (Arabic (Libya))", + "Volume": "22,844", + "Unit": "sentences" + }, + { + "Name": "Tunisian", + "Dialect": "ar-TN: (Arabic (Tunisia))", + "Volume": "21,440", + "Unit": "sentences" + }, + { + "Name": "Algerian", + "Dialect": "ar-DZ: (Arabic (Algeria))", + "Volume": "21,358", + "Unit": "sentences" + }, + { + "Name": "Moroccan", + "Dialect": "ar-MA: (Arabic (Morocco))", + "Volume": "20,735", + "Unit": "sentences" + } + ], + "HF Link": "https://hf.co/datasets/arbml/Twt15DA_Lists", + "Link": "https://github.com/Maha-J-Althobaiti/Twt15DA_Lists", + "License": "CC BY-NC-ND 4.0", + "Year": 2021, + "Language": "ar", + "Dialect": "mixed", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(translation)", + "Description": "The annotated dialectal Arabic corpus (Twt15DA) is collected from Twitter and consists of 311,785 tweets containing 3,858,459 words in total. 
The authors randomly selected a sample of 75 tweets per country (1,125 tweets in total) and conducted a manual dialect identification task with native speakers.", + "Volume": "311,785", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "Taif University", + "Derived From": "nan", + "Paper Title": "Creation of annotated country-level dialectal Arabic resources: An unsupervised approach", + "Paper Link": "https://web.archive.org/web/20210813220628id_/https://www.cambridge.org/core/services/aop-cambridge-core/content/view/2DE64B777EF0277557AFA90E2BB75B62/S135132492100019Xa.pdf/div-class-title-creation-of-annotated-country-level-dialectal-arabic-resources-an-unsupervised-approach-div.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "dialect identification", + "Venue Title": "Cambridge University Press", + "Citations": "1.0", + "Venue Type": "journal", + "Venue Name": "Natural Language Engineering (2021), Cambridge University Press", + "Authors": "Maha J. Althobaiti", + "Affiliations": "Department of Computer Science, College of Computers and Information Technology, Taif University", + "Abstract": "The wide usage of multiple spoken Arabic dialects on social networking sites stimulates increasing interest in Natural Language Processing (NLP) for dialectal Arabic (DA). Arabic dialects represent true linguistic diversity and differ from modern standard Arabic (MSA). In fact, the complexity and variety of these dialects make it insufficient to build one NLP system that is suitable for all of them. In comparison with MSA, the available datasets for various dialects are generally limited in terms of size, genre and scope. In this article, we present a novel approach that automatically develops an annotated country-level dialectal Arabic corpus and builds lists of words that encompass 15 Arabic dialects. The algorithm uses an iterative procedure consisting of two main components: automatic creation of lists for dialectal words and automatic creation of annotated Arabic dialect identification corpus. To our knowledge, our study is the first of its kind to examine and analyse the poor performance of the MSA part-of-speech tagger on dialectal Arabic contents and to exploit that in order to extract the dialectal words. The pointwise mutual information association measure and the geographical frequency of word occurrence online are used to classify dialectal words. The annotated dialectal Arabic corpus (Twt15DA), built using our algorithm, is collected from Twitter and consists of 311,785 tweets containing 3,858,459 words in total. We randomly selected a sample of 75 tweets per country, 1125 tweets in total, and conducted a manual dialect identification task by native speakers.
The results show an average inter-annotator agreement score equal to 64%, which reflects satisfactory agreement considering the overlapping features of the 15 Arabic dialects.", + "Added By": "Mustafa Ghaleb" +} \ No newline at end of file diff --git a/datasets/tydiqa.json b/datasets/tydiqa.json new file mode 100644 index 0000000..30d6baf --- /dev/null +++ b/datasets/tydiqa.json @@ -0,0 +1,36 @@ +{ + "Name": "TYDIQA", + "Subsets": [], + "HF Link": "https://hf.co/datasets/google-research-datasets/tydiqa", + "Link": "https://github.com/google-research-datasets/tydiqa", + "License": "Apache-2.0", + "Year": 2020, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "question answering dataset covering 11 typologically diverse languages with 204K question-answer pairs", + "Volume": "25,893", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "Google", + "Derived From": "nan", + "Paper Title": "TYDI QA: A Benchmark for Information-Seeking Question Answering in Typologically Diverse Languages", + "Paper Link": "https://storage.googleapis.com/tydiqa/tydiqa.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "question answering", + "Venue Title": "TACL", + "Citations": "91.0", + "Venue Type": "journal", + "Venue Name": "Transactions of the Association for Computational Linguistics", + "Authors": "J. Clark,Eunsol Choi,Michael Collins,Dan Garrette,T. Kwiatkowski,Vitaly Nikolaev,Jennimaria Palomaki", + "Affiliations": ",,,Google Research,,,", + "Abstract": "Confidently making progress on multilingual modeling requires challenging, trustworthy evaluations. We present TyDi QA\u2014a question answering dataset covering 11 typologically diverse languages with 204K question-answer pairs. The languages of TyDi QA are diverse with regard to their typology\u2014the set of linguistic features each language expresses\u2014such that we expect models performing well on this set to generalize across a large number of the world\u2019s languages. We present a quantitative analysis of the data quality and example-level qualitative linguistic analyses of observed language phenomena that would not be found in English-only corpora.
To provide a realistic information-seeking task and avoid priming effects, questions are written by people who want to know the answer, but don\u2019t know the answer yet, and the data is collected directly in each language without the use of translation.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/udp_(udp-nyuad).json b/datasets/udp_(udp-nyuad).json new file mode 100644 index 0000000..bdf1ef2 --- /dev/null +++ b/datasets/udp_(udp-nyuad).json @@ -0,0 +1,36 @@ +{ + "Name": "UDP (UDP-NYUAD)", + "Subsets": [], + "HF Link": "https://hf.co/datasets/universal_dependencies/viewer/ar_nyuad", + "Link": "https://github.com/UniversalDependencies/UD_Arabic-NYUAD", + "License": "CC BY-SA 4.0", + "Year": 2017, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "manual curation", + "Description": "The treebank consists of 19,738 sentences (738,889 tokens), and its domain is mainly newswire.", + "Volume": "738,889", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "Multiple Institutions, NYUAD", + "Derived From": "The NYUAD Arabic UD treebank is based on the Penn Arabic Treebank (PATB), parts 1, 2, and 3, through conversion to CATiB dependency trees.", + "Paper Title": "Universal Dependencies for Arabic", + "Paper Link": "https://aclanthology.org/W17-1320.pdf", + "Script": "Arab", + "Tokenized": "Yes", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "morphological attribute tagging, dependency parsing, part of speech tagging", + "Venue Title": "WANLP", + "Citations": "12.0", + "Venue Type": "workshop", + "Venue Name": "Arabic Natural Language Processing Workshop", + "Authors": "Dima Taji,Nizar Habash,Daniel Zeman", + "Affiliations": ",,", + "Abstract": "We describe the process of creating NUDAR, a Universal Dependency treebank for Arabic. We present the conversion from the Penn Arabic Treebank to the Universal Dependency syntactic representation through an intermediate dependency representation. We discuss the challenges faced in the conversion of the trees, the decisions we made to solve them, and the validation of our conversion.
We also present initial parsing results on NUDAR.", + "Added By": "Maraim Masoud" +} \ No newline at end of file diff --git a/datasets/ufal_parallel_corpus_of_north_levantine_1_0.json b/datasets/ufal_parallel_corpus_of_north_levantine_1_0.json new file mode 100644 index 0000000..36db87b --- /dev/null +++ b/datasets/ufal_parallel_corpus_of_north_levantine_1_0.json @@ -0,0 +1,36 @@ +{ + "Name": "UFAL Parallel Corpus of North Levantine 1.0", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/UFAL", + "Link": "https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-5033", + "License": "unknown", + "Year": 2023, + "Language": "multilingual", + "Dialect": "ar-LEV: (Arabic (Levant))", + "Domain": "transcribed audio", + "Form": "text", + "Collection Style": "human translation", + "Description": "120,600 multiparallel sentences in English, French, German, Greek, Spanish, and Standard Arabic selected from the OpenSubtitles2018 corpus and manually translated into North Levantine Arabic.", + "Volume": "120,600", + "Unit": "sentences", + "Ethical Risks": "nan", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "Multi-Parallel Corpus of North Levantine Arabic", + "Paper Link": "https://aclanthology.org/2023.arabicnlp-1.34/", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "machine translation, language modeling, dialect identification", + "Venue Title": "ArabicNLP", + "Citations": "nan", + "Venue Type": "conference", + "Venue Name": "Arabic Natural Language Processing Conference", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "Low-resource Machine Translation (MT) is characterized by the scarce availability of training data and/or standardized evaluation benchmarks. In the context of Dialectal Arabic, recent works introduced several evaluation benchmarks covering both Modern Standard Arabic (MSA) and dialects, mapping, however, mostly to a single Indo-European language - English. In this work, we introduce a multi-lingual corpus consisting of 120,600 multi-parallel sentences in English, French, German, Greek, Spanish, and MSA selected from the OpenSubtitles corpus, which were manually translated into the North Levantine Arabic.
By conducting a series of training and fine-tuning experiments, we explore how this novel resource can contribute to the research on Arabic MT.", + "Added By": "Amr Keleg" +} \ No newline at end of file diff --git a/datasets/ultimate_arabic_news_dataset.json b/datasets/ultimate_arabic_news_dataset.json new file mode 100644 index 0000000..3682e1c --- /dev/null +++ b/datasets/ultimate_arabic_news_dataset.json @@ -0,0 +1,36 @@ +{ + "Name": "Ultimate Arabic News Dataset", + "Subsets": [], + "HF Link": "https://hf.co/datasets/khalidalt/ultimate_arabic_news", + "Link": "https://data.mendeley.com/datasets/jz56k5wxz7/1", + "License": "CC BY 4.0", + "Year": 2022, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "crawling", + "Description": "The Ultimate Arabic News Dataset is a collection of single-label modern Arabic texts that are used in news websites and press articles.", + "Volume": "381,000", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "Yalova Universitesi", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "Mendeley Data", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "topic classification", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "Al-Dulaimi, Ahmed Hashim", + "Affiliations": "Yalova Universitesi", + "Abstract": "nan", + "Added By": "Khalid Almubarak" +} \ No newline at end of file diff --git a/datasets/un_multi.json b/datasets/un_multi.json new file mode 100644 index 0000000..9d30bc6 --- /dev/null +++ b/datasets/un_multi.json @@ -0,0 +1,36 @@ +{ + "Name": "un_multi", + "Subsets": [], + "HF Link": "https://hf.co/datasets/Helsinki-NLP/multiun", + "Link": "https://hf.co/datasets/un_multi", + "License": "unknown", + "Year": 2010, + "Language": "multilingual", + "Dialect": "ar-CLS: (Arabic (Classic))", + "Domain": "other", + "Form": "text", + "Collection Style": "human translation", + "Description": "This is a collection of translated documents from the United Nations.", + "Volume": "300,000,000", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "UN", + "Derived From": "nan", + "Paper Title": "MultiUN: A Multilingual Corpus from United Nation Documents", + "Paper Link": "http://www.lrec-conf.org/proceedings/lrec2010/pdf/686_Paper.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "HuggingFace", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "machine translation", + "Venue Title": "LREC", + "Citations": "228.0", + "Venue Type": "conference", + "Venue Name": "International Conference on Language Resources and Evaluation", + "Authors": "Andreas Eisele, Yu Chen", + "Affiliations": "German Research Center for Artificial Intelligence (DFKI)", + "Abstract": "This paper describes the acquisition, preparation and properties of a corpus extracted from the official documents of the United Nations (UN). This corpus is available in all 6 official languages of the UN, consisting of around 300 million words per language. We describe the methods we used for crawling, document formatting, and sentence alignment. This corpus also includes a common test set for machine translation. We present the results of a French-Chinese machine translation experiment performed on this corpus.", + "Added By": "Khalid N. 
Elmadani" +} \ No newline at end of file diff --git a/datasets/understanding_and_detecting_dangerous_speech_in_social_media.json b/datasets/understanding_and_detecting_dangerous_speech_in_social_media.json new file mode 100644 index 0000000..de460cb --- /dev/null +++ b/datasets/understanding_and_detecting_dangerous_speech_in_social_media.json @@ -0,0 +1,36 @@ +{ + "Name": "Understanding and Detecting Dangerous Speech in Social Media", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/Dangerous_Dataset", + "Link": "https://github.com/UBC-NLP/Arabic-Dangerous-Dataset", + "License": "unknown", + "Year": 2020, + "Language": "ar", + "Dialect": "mixed", + "Domain": "social media", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "Dangerous speech detection", + "Volume": "5,000", + "Unit": "sentences", + "Ethical Risks": "High", + "Provider": "The University of British Columbia", + "Derived From": "nan", + "Paper Title": "Understanding and Detecting Dangerous Speech in Social Media", + "Paper Link": "https://arxiv.org/pdf/2005.06608.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "dangerous speech detection", + "Venue Title": "arXiv", + "Citations": "8.0", + "Venue Type": "preprint", + "Venue Name": "nan", + "Authors": "Ali Alshehri, El Moatez Billah Nagoudi, Muhammad Abdul-Mageed", + "Affiliations": "The University of British Columbia", + "Abstract": "Social media communication has become a significant part of daily activity in modern societies. For this reason, ensuring safety in social media platforms is a necessity. Use of dangerous language such as physical threats in online environments is a somewhat rare, yet remains highly important. Although several works have been performed on the related issue of detecting offensive and hateful language, dangerous speech has not previously been treated in any significant way. Motivated by these observations, we report our efforts to build a labeled dataset for dangerous speech. We also exploit our dataset to develop highly effective models to detect dangerous content. 
Our best model performs at 59.60% macro F1, significantly outperforming a competitive baseline.", + "Added By": "Abdelrahman Kaseb" +} \ No newline at end of file diff --git a/datasets/unified_linguistic_annotation_text_collection.json b/datasets/unified_linguistic_annotation_text_collection.json new file mode 100644 index 0000000..19762d5 --- /dev/null +++ b/datasets/unified_linguistic_annotation_text_collection.json @@ -0,0 +1,36 @@ +{ + "Name": "Unified Linguistic Annotation Text Collection", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2009T07", + "License": "LDC User Agreement for Non-Members", + "Year": 2009, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "A text collection supporting unified linguistic annotation; sample annotations are available as LDC2009T10 and LDC2009T11.", + "Volume": "22,500", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "LDC", + "Access": "Upon-Request", + "Cost": "nan", + "Test Split": "No", + "Tasks": "summarization,sociolinguistics,question-answering,psycholinguistics,pragmatics,information retrieval", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/unimorph.json b/datasets/unimorph.json new file mode 100644 index 0000000..4000fa5 --- /dev/null +++ b/datasets/unimorph.json @@ -0,0 +1,36 @@ +{ + "Name": "UniMorph", + "Subsets": [], + "HF Link": "https://hf.co/datasets/unimorph/universal_morphologies", + "Link": "https://github.com/unimorph/ara", + "License": "CC BY-SA 3.0", + "Year": 2015, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "167 languages have been annotated according to the UniMorph schema.
", + "Volume": "140003", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "Johns Hopkins University", + "Derived From": "nan", + "Paper Title": "The Composition and Use of the Universal Morphological Feature Schema (UniMorph Schema)", + "Paper Link": "https://unimorph.github.io/doc/unimorph-schema.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "morphological analysis", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "preprint", + "Venue Name": "nan", + "Authors": "John Sylak-Glassman\n", + "Affiliations": "Center for Language and Speech Processing Johns Hopkins University", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/united_nations_general_assembly_resolutions.json b/datasets/united_nations_general_assembly_resolutions.json new file mode 100644 index 0000000..d307b1e --- /dev/null +++ b/datasets/united_nations_general_assembly_resolutions.json @@ -0,0 +1,36 @@ +{ + "Name": "United Nations General Assembly Resolutions", + "Subsets": [], + "HF Link": "https://hf.co/datasets/Helsinki-NLP/un_ga", + "Link": "https://opus.nlpl.eu/UN.php", + "License": "unknown", + "Year": 2009, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "crawling", + "Description": "This is a collection of translated documents from the United Nations originally compiled into a translation memory by Alexandre Rafalovitch, Robert Dale", + "Volume": "73,000", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "OPUS", + "Derived From": "nan", + "Paper Title": "United Nations General Assembly Resolutions: A Six-Language Parallel Corpus", + "Paper Link": "https://aclanthology.org/2009.mtsummit-posters.15.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "machine translation", + "Venue Title": "mtsummit", + "Citations": "nan", + "Venue Type": "conference", + "Venue Name": "Machine Translation Summit XII", + "Authors": "Alexandre Rafalovitch, Robert Dale", + "Affiliations": "United Nations; Centre for Language Technology Macquarie University", + "Abstract": "In this paper we describe a six-ways parallel public-domain corpus consisting of 2100\nUnited Nations General Assembly Resolutions with translations in the six official languages of the United Nations, with an average of around 3 million tokens per language. The corpus is available in a preprocessed, formatting-normalized TMX format with paragraphs aligned across multiple\nlanguages. 
We describe the background to the\ncorpus and its content, the process of its construction, and some of its interesting properties.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/united_nations_parallel_corpus.json b/datasets/united_nations_parallel_corpus.json new file mode 100644 index 0000000..eee8a77 --- /dev/null +++ b/datasets/united_nations_parallel_corpus.json @@ -0,0 +1,36 @@ +{ + "Name": "United Nations Parallel Corpus", + "Subsets": [], + "HF Link": "https://hf.co/datasets/Helsinki-NLP/multiun", + "Link": "https://conferences.unite.un.org/uncorpus", + "License": "custom", + "Year": 2016, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "human translation", + "Description": "The parallel corpus presented consists of manually translated UN documents from the last 25 years", + "Volume": "540,152", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "United Nations", + "Derived From": "nan", + "Paper Title": "The United Nations Parallel Corpus v1.0", + "Paper Link": "https://conferences.unite.un.org/UNCORPUS/Content/Doc/un.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "machine translation", + "Venue Title": "LREC", + "Citations": "233.0", + "Venue Type": "conference", + "Venue Name": "International Conference on Language Resources and Evaluation", + "Authors": "Michal Ziemski,Marcin Junczys-Dowmunt,B. Pouliquen", + "Affiliations": ",,", + "Abstract": "This paper describes the creation process and statistics of the official United Nations Parallel Corpus, the first parallel corpus composed from United Nations documents published by the original data creator. The parallel corpus presented consists of manually translated UN documents from the last 25 years (1990 to 2014) for the six official UN languages, Arabic, Chinese, English, French, Russian, and Spanish. The corpus is freely available for download under a liberal license. Apart from the pairwise aligned documents, a fully aligned subcorpus for the six official UN languages is distributed. We provide baseline BLEU scores of our Moses-based SMT systems trained with the full data of language pairs involving English and for all possible translation directions of the six-way subcorpus.", + "Added By": "nan" +} \ No newline at end of file diff --git a/datasets/united_nations_proceedings_speech.json b/datasets/united_nations_proceedings_speech.json new file mode 100644 index 0000000..3cb0dcc --- /dev/null +++ b/datasets/united_nations_proceedings_speech.json @@ -0,0 +1,36 @@ +{ + "Name": "United Nations Proceedings Speech", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2014S08", + "License": "LDC User Agreement for Non-Members", + "Year": 2014, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "transcribed audio", + "Form": "spoken", + "Collection Style": "other", + "Description": "Data is presented as either mp3 or flac-compressed wav; files are 16-bit, single-channel, at either 22,050 or 8,000 Hz, and are organized by committee and session number, then language. The folder labeled \"Floor\" indicates the microphone used by the particular speaker.
Those files may include other languages, for instance, if the speaker's language was not among the six official UN languages.", + "Volume": "8,500", + "Unit": "hours", + "Ethical Risks": "Low", + "Provider": "LDC", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "5,000.00 $", + "Test Split": "No", + "Tasks": "speech recognition,language identification", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/universal_dependencies.json b/datasets/universal_dependencies.json new file mode 100644 index 0000000..b27e17a --- /dev/null +++ b/datasets/universal_dependencies.json @@ -0,0 +1,55 @@ +{ + "Name": "Universal Dependencies", + "Subsets": [ + { + "Name": "ar_nyuad", + "Dialect": "nan", + "Volume": "738,889", + "Unit": "tokens" + }, + { + "Name": "ar_padt", + "Dialect": "nan", + "Volume": "282,384", + "Unit": "tokens" + }, + { + "Name": "ar_pud", + "Dialect": "nan", + "Volume": "20,751", + "Unit": "tokens" + } + ], + "HF Link": "https://hf.co/datasets/universal_dependencies", + "Link": "https://github.com/UniversalDependencies", + "License": "unknown", + "Year": 2020, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "Universal Dependencies is a project that seeks to develop cross-linguistically consistent treebank annotation for many languages.", + "Volume": "1,042,000", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "Universal Dependencies (UD)", + "Derived From": "UDP (UDP-NYUAD), PADT, PUD", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "Yes", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "part of speech tagging, morphological features, syntactic dependencies", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/watan-2004.json b/datasets/watan-2004.json new file mode 100644 index 0000000..9c1c5a5 --- /dev/null +++ b/datasets/watan-2004.json @@ -0,0 +1,36 @@ +{ + "Name": "Watan-2004", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/watan_2004", + "Link": "https://sourceforge.net/projects/arabiccorpus/files/", + "License": "unknown", + "Year": 2010, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "crawling", + "Description": "Watan-2004 corpus contains about 20,000 articles covering the following six topics (\"categories\"): Culture, Religion, Economy, Local News, International News and Sports. 
In this corpus, punctuation has been omitted intentionally in order to make it useful for language modeling.", + "Volume": "20,000", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "Multiple Institutions", + "Derived From": "nan", + "Paper Title": "Comparing TR-Classifier and KNN by using Reduced Sizes of Vocabularies", + "Paper Link": "https://hal.archives-ouvertes.fr/hal-01586533/document", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "topic classification", + "Venue Title": "CITALA", + "Citations": "nan", + "Venue Type": "conference", + "Venue Name": "International Conference on Arabic Language Processing", + "Authors": "M. Abbas, K. Smaili, and D. Berkani", + "Affiliations": "CRSTDLA/Speech Processing Laboratory; INRIA-LORIA/Parole team, Villers les Nancy; NPS/Signal and Communication Laboratory", + "Abstract": "The aim of this study is topic identification by\nusing two methods, in this case, a new one that we have\nproposed: TR-classifier which is based on computing\ntriggers, and the well-known k Nearest Neighbors.\nPerformances are acceptable, particularly for TR-classifier,\nthough we have used reduced sizes of vocabularies. For the\nTR-Classifier, each topic is represented by a vocabulary\nwhich has been built using the corresponding training\ncorpus. Whereas, the kNN method uses a general\nvocabulary, obtained by the concatenation of those used by\nthe TR-Classifier. For the evaluation task, six topics have\nbeen selected to be identified: Culture, religion, economy,\nlocal news, international news and sports. An Arabic corpus\nhas been used to achieve experiments. \n", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/waw.json b/datasets/waw.json new file mode 100644 index 0000000..5c61c64 --- /dev/null +++ b/datasets/waw.json @@ -0,0 +1,36 @@ +{ + "Name": "WAW", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/WAW", + "Link": "https://alt.qcri.org/resources/wawcorpus/", + "License": "custom", + "Year": 2018, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "transcribed audio", + "Form": "spoken", + "Collection Style": "human translation", + "Description": "an interpreting corpus for English/Arabic", + "Volume": "31", + "Unit": "hours", + "Ethical Risks": "Low", + "Provider": "QCRI,Hamad Bin Khalifa University", + "Derived From": "nan", + "Paper Title": "The WAW Corpus: The First Corpus of Interpreted Speeches and their Translations for English and Arabic", + "Paper Link": "https://alt.qcri.org/resources/wawcorpus/", + "Script": "Arab-Latn", + "Tokenized": "No", + "Host": "QCRI Resources", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "speech recognition, machine translation", + "Venue Title": "LREC", + "Citations": "1.0", + "Venue Type": "conference", + "Venue Name": "International Conference on Language Resources and Evaluation", + "Authors": "Ahmed Abdelali,Irina Temnikova,S. Hedaya,S. Vogel", + "Affiliations": ",University of Wolverhampton, Bulgarian Academy of Sciences, Qatar Computing Research Institute, Mitra Translations,,", + "Abstract": "This article presents the WAW Corpus, an interpreting corpus for English/Arabic, which can be used for teaching interpreters, studying the characteristics of interpreters\u2019 work, as well as to train machine translation systems. 
The corpus contains recordings of lectures and speeches from international conferences, their interpretations, the transcripts of the original speeches and of their interpretations, as well as human translations of both kinds of transcripts into the opposite language of the language pair. The article presents the corpus curation, statistics, assessment, as well as a case study of the corpus use.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/wdc.json b/datasets/wdc.json new file mode 100644 index 0000000..a792a0a --- /dev/null +++ b/datasets/wdc.json @@ -0,0 +1,36 @@ +{ + "Name": "WDC", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/WDC", + "Link": "https://github.com/Maha-J-Althobaiti/Arabic_NER_Wiki-Corpus", + "License": "CC BY 3.0", + "Year": 2014, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "wikipedia", + "Form": "text", + "Collection Style": "crawling", + "Description": "contains around 6 million tokens representing different genres, as Wikipedia is considered an open domain", + "Volume": "6,000,000", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "University of Essex", + "Derived From": "nan", + "Paper Title": "Automatic Creation of Arabic Named Entity Annotated Corpus Using Wikipedia", + "Paper Link": "https://aclanthology.org/E14-3012.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "named entity recognition", + "Venue Title": "EACL", + "Citations": "14.0", + "Venue Type": "conference", + "Venue Name": "European Chapter of the Association for Computational Linguistics", + "Authors": "M. Althobaiti,Udo Kruschwitz,Massimo Poesio", + "Affiliations": ",University of Regensburg,", + "Abstract": "In this paper we propose a new methodology to exploit Wikipedia features and structure to automatically develop an Arabic NE annotated corpus. Each Wikipedia link is transformed into an NE type of the target article in order to produce the NE annotation. Other Wikipedia features - namely redirects, anchor texts, and inter-language links - are used to tag additional NEs, which appear without links in Wikipedia texts. Furthermore, we have developed a filtering algorithm to eliminate ambiguity when tagging candidate NEs. Herein we also introduce a mechanism based on the high coverage of Wikipedia in order to address two challenges particular to tagging NEs in Arabic text: rich morphology and the absence of capitalisation. The corpus created with our new method (WDC) has been used to train an NE tagger which has been tested on different domains. 
Judging by the results, an NE tagger trained on WDC can compete with those trained on manually annotated corpora.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/west_point_arabic_speech.json b/datasets/west_point_arabic_speech.json new file mode 100644 index 0000000..875e799 --- /dev/null +++ b/datasets/west_point_arabic_speech.json @@ -0,0 +1,36 @@ +{ + "Name": "West Point Arabic Speech", + "Subsets": [], + "HF Link": "nan", + "Link": "https://catalog.ldc.upenn.edu/LDC2002S02", + "License": "LDC User Agreement for Non-Members", + "Year": 2002, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "transcribed audio", + "Form": "spoken", + "Collection Style": "manual curation", + "Description": "The corpus consists of 8,516 speech files, totaling 1.7 gigabytes or 11.42 hours of speech data. Each speech file represents one person reciting one prompt from one of four prompt scripts. The utterances were recorded using a Shure SM10A microphone and a RANE Model MS1 pre-amplifier. The files were recorded as 16-bit PCM low-byte-first (\"little-endian\") raw audio files, with a sampling rate of 22.05 KHz. They were then converted to NIST sphere format. Approximately 7,200 of the recordings are from native informants and 1200 files are from non-native informants. The following tables show the breakdown of corpus content in terms of male, female, native and non-native speakers.", + "Volume": "11.42", + "Unit": "hours", + "Ethical Risks": "Low", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "LDC", + "Access": "With-Fee", + "Cost": "1,000.00 $", + "Test Split": "No", + "Tasks": "speech recognition", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "Stephen A. LaRocca, Rajaa Chouairi", + "Affiliations": "Department of Foreign languages at the United States Military Academy at West Point and the Center For Technology Enhanced Language Learning (CTELL)", + "Abstract": "nan", + "Added By": "Mustafa Ghaleb" +} \ No newline at end of file diff --git a/datasets/wikiann.json b/datasets/wikiann.json new file mode 100644 index 0000000..79f9a4b --- /dev/null +++ b/datasets/wikiann.json @@ -0,0 +1,36 @@ +{ + "Name": "wikiann", + "Subsets": [], + "HF Link": "https://hf.co/datasets/unimelb-nlp/wikiann", + "Link": "https://drive.google.com/drive/folders/1Q-xdT99SeaCghihGa7nRkcXGwRGUIsKN", + "License": "unknown", + "Year": 2017, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "wikipedia", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "Both name tagging and linking results for 282 languages are promising on Wikipedia data and non-Wikipedia data", + "Volume": "185,000", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "Multiple Institutions", + "Derived From": "nan", + "Paper Title": "Cross-lingual Name Tagging and Linking for 282 Languages", + "Paper Link": "https://aclanthology.org/P17-1178.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "Gdrive", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "named entity recognition", + "Venue Title": "ACL", + "Citations": "168.0", + "Venue Type": "conference", + "Venue Name": "Association for Computational Linguistics", + "Authors": "Xiaoman Pan,Boliang Zhang,Jonathan May,J. 
Nothman,Kevin Knight,Heng Ji", + "Affiliations": ",,,,,", + "Abstract": "The ambitious goal of this work is to develop a cross-lingual name tagging and linking framework for 282 languages that exist in Wikipedia. Given a document in any of these languages, our framework is able to identify name mentions, assign a coarse-grained or fine-grained type to each mention, and link it to an English Knowledge Base (KB) if it is linkable. We achieve this goal by performing a series of new KB mining methods: generating \u201csilver-standard\u201d annotations by transferring annotations from English to other languages through cross-lingual links and KB properties, refining annotations through self-training and topic selection, deriving language-specific morphology features from anchor links, and mining word translation pairs from cross-lingual links. Both name tagging and linking results for 282 languages are promising on Wikipedia data and non-Wikipedia data.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/wikiann_ner(mmner).json b/datasets/wikiann_ner(mmner).json new file mode 100644 index 0000000..c8038f0 --- /dev/null +++ b/datasets/wikiann_ner(mmner).json @@ -0,0 +1,36 @@ +{ + "Name": "WikiANN NER(MMNER)", + "Subsets": [], + "HF Link": "https://hf.co/datasets/unimelb-nlp/wikiann", + "Link": "https://github.com/afshinrahimi/mmner", + "License": "unknown", + "Year": 2019, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "wikipedia", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "Cross-lingual name tagging and linking for 282 languages", + "Volume": "30,000", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "The University of Melbourne", + "Derived From": "nan", + "Paper Title": "Massively Multilingual Transfer for NER", + "Paper Link": "https://aclanthology.org/P19-1015.pdf", + "Script": "Arab", + "Tokenized": "Yes", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "named entity recognition", + "Venue Title": "ACL", + "Citations": "54.0", + "Venue Type": "conference", + "Venue Name": "Association for Computational Linguistics", + "Authors": "Afshin Rahimi,Yuan Li,Trevor Cohn", + "Affiliations": "University of Melbourne,,University of Melbourne", + "Abstract": "In cross-lingual transfer, NLP models over one or more source languages are applied to a low-resource target language. While most prior work has used a single source model or a few carefully selected models, here we consider a \u201cmassive\u201d setting with many such models. This setting raises the problem of poor transfer, particularly from distant languages. We propose two techniques for modulating the transfer, suitable for zero-shot or few-shot learning, respectively. 
Evaluating on named entity recognition, we show that our techniques are much more effective than strong baselines, including standard ensembling, and our unsupervised method rivals oracle selection of the single best individual model.", + "Added By": "Maraim Masoud" +} \ No newline at end of file diff --git a/datasets/wikidocsaligner_dataset.json b/datasets/wikidocsaligner_dataset.json new file mode 100644 index 0000000..0b3972f --- /dev/null +++ b/datasets/wikidocsaligner_dataset.json @@ -0,0 +1,49 @@ +{ + "Name": "WikiDocsAligner Dataset", + "Subsets": [ + { + "Name": "Arabic", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Volume": "10,197", + "Unit": "documents" + }, + { + "Name": "Egyptian", + "Dialect": "ar-EGY: (Arabic (Egypt))", + "Volume": "10,197", + "Unit": "documents" + } + ], + "HF Link": "nan", + "Link": "https://github.com/motazsaad/egy-arb-dialect-id", + "License": "CC BY-SA 4.0", + "Year": 2017, + "Language": "ar", + "Dialect": "mixed", + "Domain": "wikipedia", + "Form": "text", + "Collection Style": "manual curation", + "Description": "WikiDocsAligner is a tool designed to align Wikipedia articles from different languages, creating comparable corpora. It can align documents between language pairs, including Arabic and its dialects. The tool uses Wikipedia dumps and interlanguage links to perform document alignment.", + "Volume": "20,394", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "Islamic University of Gaza", + "Derived From": "Wikipedia", + "Paper Title": "WikiDocsAligner: an off-the-shelf Wikipedia Documents Alignment Tool", + "Paper Link": "https://doi.org/10.1109/PICICT.2017.27", + "Script": "Arab-Latn", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "cross-lingual information retrieval, document alignment, bilingual lexicon extraction, comparable corpus construction", + "Venue Title": "PICICT", + "Citations": "nan", + "Venue Type": "conference", + "Venue Name": "Palestinian International Conference on Information and Communication Technology", + "Authors": "Motaz Saad, Basem O. Alijla", + "Affiliations": "Islamic University of Gaza", + "Abstract": "WikiDocsAligner is an off-the-shelf tool designed to align Wikipedia documents in different languages. The tool makes it easier for researchers to produce comparable corpora without the need for custom scripts. 
It was applied to align comparable articles between Standard Arabic Wikipedia and Egyptian Wikipedia, highlighting Wikipedia as a valuable resource for Arabic dialect corpora.", + "Added By": "Maryam Al Emadi" +} \ No newline at end of file diff --git a/datasets/wikimatrix.json b/datasets/wikimatrix.json new file mode 100644 index 0000000..557c9c9 --- /dev/null +++ b/datasets/wikimatrix.json @@ -0,0 +1,36 @@ +{ + "Name": "WikiMatrix", + "Subsets": [], + "HF Link": "https://hf.co/datasets/Tyler/wikimatrix_collapsed", + "Link": "https://github.com/facebookresearch/LASER/tree/master/tasks/WikiMatrix", + "License": "CC BY-SA", + "Year": 2021, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "wikipedia", + "Form": "text", + "Collection Style": "crawling", + "Description": "135M parallel sentences for 1620 different language pairs, out of which only 34M are aligned with English", + "Volume": "4,435", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "JHU, Facebook", + "Derived From": "nan", + "Paper Title": "WikiMatrix: Mining 135M Parallel Sentences in 1620 Language Pairs from Wikipedia", + "Paper Link": "https://github.com/facebookresearch/LASER/tree/master/tasks/WikiMatrix", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "machine translation", + "Venue Title": "EACL", + "Citations": "116.0", + "Venue Type": "conference", + "Venue Name": "European Chapter of the Association for Computational Linguistics", + "Authors": "Holger Schwenk,Vishrav Chaudhary,Shuo Sun,Hongyu Gong,Francisco Guzm\u00e1n", + "Affiliations": ",,,University of Illinois at Urbana-Champaign,", + "Abstract": "We present an approach based on multilingual sentence embeddings to automatically extract parallel sentences from the content of Wikipedia articles in 96 languages, including several dialects or low-resource languages. We do not limit the extraction process to alignments with English, but we systematically consider all possible language pairs. In total, we are able to extract 135M parallel sentences for 1620 different language pairs, out of which only 34M are aligned with English. This corpus is freely available. To get an indication on the quality of the extracted bitexts, we train neural MT baseline systems on the mined data only for 1886 language pairs, and evaluate them on the TED corpus, achieving strong BLEU scores for many language pairs. 
The WikiMatrix bitexts seem to be particularly interesting to train MT systems between distant languages without the need to pivot through English.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/wikimedia_wit_base.json b/datasets/wikimedia_wit_base.json new file mode 100644 index 0000000..665397f --- /dev/null +++ b/datasets/wikimedia_wit_base.json @@ -0,0 +1,36 @@ +{ + "Name": "wikimedia/wit_base", + "Subsets": [], + "HF Link": "https://hf.co/datasets/wikimedia/wit_base", + "Link": "https://github.com/google-research-datasets/wit", + "License": "CC BY-SA 4.0", + "Year": 2021, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "wikipedia", + "Form": "text", + "Collection Style": "crawling", + "Description": "Wikimedia's version of the Wikipedia-based Image Text (WIT) Dataset, a large multimodal multilingual dataset.", + "Volume": "6,477,255", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "Google", + "Derived From": "nan", + "Paper Title": "WIT: Wikipedia-based Image Text Dataset for Multimodal Multilingual Machine Learning", + "Paper Link": "https://arxiv.org/pdf/2103.01913.pdf", + "Script": "Arab-Latn", + "Tokenized": "No", + "Host": "HuggingFace", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "image captioning, text retrieval", + "Venue Title": "SIGIR '21", + "Citations": "40.0", + "Venue Type": "conference", + "Venue Name": "Proceedings of the 44th International ACM SIGIR Conference on Research and Development in Information Retrieval", + "Authors": "Krishna Srinivasan, Karthik Raman, Jiecao Chen, Michael Bendersky, Marc Najork", + "Affiliations": "Google", + "Abstract": "The milestone improvements brought about by deep representation learning and pre-training techniques have led to large performance gains across downstream NLP, IR and Vision tasks. Multimodal modeling techniques aim to leverage large high-quality visio-linguistic datasets for learning complementary information (across image and text modalities). In this paper, we introduce the Wikipedia-based Image Text (WIT) Dataset (https://github.com/google-research-datasets/wit) to better facilitate multimodal, multilingual learning. WIT is composed of a curated set of 37.6 million entity rich image-text examples with 11.5 million unique images across 108 Wikipedia languages. Its size enables WIT to be used as a pretraining dataset for multimodal models, as we show when applied to downstream tasks such as image-text retrieval. WIT has four main and unique advantages. First, WIT is the largest multimodal dataset by the number of image-text examples by 3x (at the time of writing). Second, WIT is massively multilingual (first of its kind) with coverage over 100+ languages (each of which has at least 12K examples) and provides cross-lingual texts for many images. Third, WIT represents a more diverse set of concepts and real world entities relative to what previous datasets cover. Lastly, WIT provides a very challenging real-world test set, as we empirically illustrate using an image-text retrieval task as an example.", + "Added By": "Khalid N. 
Elmadani" +} \ No newline at end of file diff --git a/datasets/wikipedia.json b/datasets/wikipedia.json new file mode 100644 index 0000000..760e519 --- /dev/null +++ b/datasets/wikipedia.json @@ -0,0 +1,36 @@ +{ + "Name": "Wikipedia", + "Subsets": [], + "HF Link": "https://hf.co/datasets/legacy-datasets/wikipedia", + "Link": "https://dumps.wikimedia.org/", + "License": "CC BY-SA 3.0", + "Year": 2022, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "wikipedia", + "Form": "text", + "Collection Style": "crawling", + "Description": "Wikipedia dataset containing cleaned articles of all languages. The datasets are built from the Wikipedia dump (https://dumps.wikimedia.org/) with one split per language.", + "Volume": "1,151,628", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "Wikimedia", + "Derived From": "nan", + "Paper Title": "nan", + "Paper Link": "nan", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "text generation, language modeling", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "nan", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/wikiqaar.json b/datasets/wikiqaar.json new file mode 100644 index 0000000..bc1368a --- /dev/null +++ b/datasets/wikiqaar.json @@ -0,0 +1,36 @@ +{ + "Name": "WikiQAar", + "Subsets": [], + "HF Link": "https://hf.co/datasets/qcri/wiki_qa_ar", + "Link": "https://github.com/qcri/WikiQAar", + "License": "unknown", + "Year": 2018, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "wikipedia", + "Form": "text", + "Collection Style": "crawling", + "Description": "WIKIQAar is a bilingual English--Arabic Question Answering corpus built on top of WIKIQA", + "Volume": "3,047", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "QCRI", + "Derived From": "WikiQA", + "Paper Title": "WIKIQA: A Challenge Dataset for Open-Domain Question Answering", + "Paper Link": "https://aclanthology.org/D15-1237.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "question answering", + "Venue Title": "EMNLP", + "Citations": "nan", + "Venue Type": "conference", + "Venue Name": "Empirical Methods in Natural Language Processing", + "Authors": "Yi Yang, Wen-tau Yih Christopher Meek", + "Affiliations": "Georgia Institute of Technology, Microsoft, Microsoft", + "Abstract": "We describe the WIKIQA dataset, a new\npublicly available set of question and sentence pairs, collected and annotated for research on open-domain question answering. Most previous work on answer sentence selection focuses on a dataset created using the TREC-QA data, which\nincludes editor-generated questions and\ncandidate answer sentences selected by\nmatching content words in the question.\nWIKIQA is constructed using a more natural process and is more than an order of\nmagnitude larger than the previous dataset.\nIn addition, the WIKIQA dataset also includes questions for which there are no\ncorrect sentences, enabling researchers to\nwork on answer triggering, a critical component in any QA system. 
We compare\nseveral systems on the task of answer sentence selection on both datasets and also\ndescribe the performance of a system on\nthe problem of answer triggering using the\nWIKIQA dataset.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/wili-2018.json b/datasets/wili-2018.json new file mode 100644 index 0000000..ad5c753 --- /dev/null +++ b/datasets/wili-2018.json @@ -0,0 +1,36 @@ +{ + "Name": "WiLI-2018", + "Subsets": [], + "HF Link": "https://hf.co/datasets/MartinThoma/wili_2018", + "Link": "https://zenodo.org/record/841984#.YpBRIahBxD8", + "License": "ODbL-1.0", + "Year": 2018, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "wikipedia", + "Form": "text", + "Collection Style": "crawling", + "Description": "WiLI-2018, the Wikipedia language identification benchmark dataset, contains 235000 paragraphs of 235 languages. The dataset is balanced and a train-test split is provided.", + "Volume": "1,000", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "The WiLI benchmark dataset for written language identification", + "Paper Link": "https://arxiv.org/pdf/1801.07779.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "zenodo", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "language identification", + "Venue Title": "arXiv", + "Citations": "nan", + "Venue Type": "preprint", + "Venue Name": "nan", + "Authors": "Martin Thoma", + "Affiliations": "nan", + "Abstract": "This paper describes the WiLI-2018 benchmark\ndataset for monolingual written natural language identification.\nWiLI-2018 is a publicly available, free-of-charge dataset of\nshort text extracts from Wikipedia. It contains 1000 paragraphs\nof 235 languages, totaling 235,000 paragraphs. WiLI is a\nclassification dataset: Given an unknown paragraph written in\none dominant language, it has to be decided which language it\nis.\n", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/winomt_(mt_gender).json b/datasets/winomt_(mt_gender).json new file mode 100644 index 0000000..cf6ddc2 --- /dev/null +++ b/datasets/winomt_(mt_gender).json @@ -0,0 +1,36 @@ +{ + "Name": "WinoMT (MT_Gender)", + "Subsets": [], + "HF Link": "https://hf.co/datasets/arbml/mt_gender_ar", + "Link": "https://github.com/gabrielStanovsky/mt_gender", + "License": "MIT License", + "Year": 2019, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "machine translation", + "Description": "Evaluating Gender Bias in Machine Translation", + "Volume": "3,888", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "Multiple Institutions", + "Derived From": "nan", + "Paper Title": "Evaluating Gender Bias in Machine Translation", + "Paper Link": "https://arxiv.org/pdf/1906.00591.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "machine translation", + "Venue Title": "ACL", + "Citations": "99.0", + "Venue Type": "conference", + "Venue Name": "Association for Computational Linguistics", + "Authors": "Gabriel Stanovsky,Noah A. 
Smith,Luke Zettlemoyer", + "Affiliations": "Allen Institute for Artificial Intelligence;University of Washington,,University of Washington;Facebook", + "Abstract": "We present the first challenge set and evaluation protocol for the analysis of gender bias in machine translation (MT). Our approach uses two recent coreference resolution datasets composed of English sentences which cast participants into non-stereotypical gender roles (e.g., \u201cThe doctor asked the nurse to help her in the operation\u201d). We devise an automatic gender bias evaluation method for eight target languages with grammatical gender, based on morphological analysis (e.g., the use of female inflection for the word \u201cdoctor\u201d). Our analyses show that four popular industrial MT systems and two recent state-of-the-art academic MT models are significantly prone to gender-biased translation errors for all tested target languages. Our data and code are publicly available at https://github.com/gabrielStanovsky/mt_gender.", + "Added By": "Maraim Masoud" +} \ No newline at end of file diff --git a/datasets/wojood.json b/datasets/wojood.json new file mode 100644 index 0000000..534f437 --- /dev/null +++ b/datasets/wojood.json @@ -0,0 +1,36 @@ +{ + "Name": "Wojood", + "Subsets": [], + "HF Link": "nan", + "Link": "https://ontology.birzeit.edu/Wojood/", + "License": "custom", + "Year": 2022, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "Wojood consists of about 550K tokens (MSA and dialect) that are manually annotated with 21 entity types (e.g., person, organization, location, event, date, etc). It covers multiple domains and was annotated with nested entities. The corpus contains about 75K entities and 22.5% of which are nested.", + "Volume": "550,464", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "SinaLab, Birzeit University", + "Derived From": "nan", + "Paper Title": "Wojood: Nested Arabic Named Entity Corpus and Recognition using BERT", + "Paper Link": "https://arxiv.org/pdf/2205.09651.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Upon-Request", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "named entity recognition", + "Venue Title": "arXiv", + "Citations": "nan", + "Venue Type": "preprint", + "Venue Name": "nan", + "Authors": "Mustafa Jarrar, Mohammed Khalilia, Sana Ghanem", + "Affiliations": "Birzeit University", + "Abstract": "This paper presents Wojood, a corpus for Arabic nested Named Entity Recognition (NER). Nested entities occur when one\nentity mention is embedded inside another entity mention. Wojood consists of about 550K Modern Standard Arabic (MSA) and\ndialect tokens that are manually annotated with 21 entity types including person, organization, location, event and date. More\nimportantly, the corpus is annotated with nested entities instead of the more common flat annotations. The data contains about\n75K entities and 22.5% of which are nested. The inter-annotator evaluation of the corpus demonstrated a strong agreement\nwith Cohen\u2019s Kappa of 0.979 and an F1-score of 0.976. To validate our data, we used the corpus to train a nested NER model\nbased on multi-task learning using the pre-trained AraBERT (Arabic BERT). The model achieved an overall micro F1-score of\n0.884. 
Our corpus, the annotation guidelines, the source code and the pre-trained model are publicly available.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/wojoodfine.json b/datasets/wojoodfine.json new file mode 100644 index 0000000..0993a24 --- /dev/null +++ b/datasets/wojoodfine.json @@ -0,0 +1,36 @@ +{ + "Name": "WojoodFine", + "Subsets": [], + "HF Link": "nan", + "Link": "https://sina.birzeit.edu/wojood/", + "License": "CC BY 4.0", + "Year": 2023, + "Language": "ar", + "Dialect": "mixed", + "Domain": "news articles", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "WojoodFine is an extension of Wojood and consists of about 550K tokens (MSA and dialect) that are manually annotated with 21 entity types; the four main entity types in Wojood (GPE, LOC, ORG, and FAC) are further annotated with 31 new fine-grained subtypes. It covers multiple domains and was annotated with nested entities. The corpus contains about 75K entities, 22.5% of which are nested. A nested named entity recognition (NER) model based on BERT was trained (F1-scores of 0.920 for flat NER and 0.885 for nested NER with subtypes).", + "Volume": "550,000", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "SinaLab, Birzeit University", + "Derived From": "nan", + "Paper Title": "Arabic Fine-Grained Entity Recognition", + "Paper Link": "https://arxiv.org/abs/2310.17333", + "Script": "Arab-Latn", + "Tokenized": "Yes", + "Host": "SinaLab Resources", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "language modeling, Tokenization, named entity recognition", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "Haneen Liqreina, Mustafa Jarrar, Mohammed Khalilia, Ahmed Oumar El-Shangiti, Muhammad AbdulMageed", + "Affiliations": "nan", + "Abstract": "Traditional NER systems are typically trained to recognize coarse-grained entities, and less attention is given to classifying entities into a hierarchy of fine-grained lower-level subtypes. This article aims to advance Arabic NER with fine-grained entities. We chose to extend Wojood (an open-source Nested Arabic Named Entity Corpus) with subtypes. In particular, four main entity types in Wojood, geopolitical entity (GPE), location (LOC), organization (ORG), and facility (FAC), are extended with 31 subtypes. To do this, we first revised Wojood\u2019s annotations of GPE, LOC, ORG, and FAC to be compatible with the LDC\u2019s ACE guidelines, which yielded 5,614 changes. Second, all mentions of GPE, LOC, ORG, and FAC (\u223c 44K) in Wojood are manually annotated with the LDC\u2019s ACE subtypes. We refer to this extended version of Wojood as WojoodFine. To evaluate our annotations, we measured the inter-annotator agreement (IAA) using both Cohen\u2019s Kappa and F1 score, resulting in 0.9861 and 0.9889, respectively. 
To compute the baselines of WojoodFine, we finetune three pre-trained Arabic BERT encoders in three settings: flat NER, nested NER and nested NER with subtypes and achieved F1 score of 0.920, 0.866, and 0.885, respectively.", + "Added By": "Tymaa Hammouda" +} \ No newline at end of file diff --git a/datasets/wojoodgaza.json b/datasets/wojoodgaza.json new file mode 100644 index 0000000..42eb087 --- /dev/null +++ b/datasets/wojoodgaza.json @@ -0,0 +1,36 @@ +{ + "Name": "WojoodGaza", + "Subsets": [], + "HF Link": "nan", + "Link": "https://sina.birzeit.edu/wojood/", + "License": "CC BY 4.0", + "Year": 2024, + "Language": "ar", + "Dialect": "mixed", + "Domain": "news articles", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "It is an extension of Wojood. It is about the ongoing Israeli War on Gaza, based on the assumption that discourse about recent global events will involve mentions from different data distributions. The dataset is collected from five news domains related to the War (Health, Economics, Finance, Politics, and Law). It consists of 60k tokens, divided into a test set (50k) and a development set (10k), with the domains evenly distributed. It is manually annotated with fine-grained named entities, following the same annotation guidelines as WojoodFine.", + "Volume": "60,000", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "SinaLab, Birzeit University", + "Derived From": "nan", + "Paper Title": "WojoodNER 2024: The Second Arabic Named Entity Recognition Shared Task", + "Paper Link": "https://arxiv.org/abs/2407.09936", + "Script": "Arab-Latn", + "Tokenized": "Yes", + "Host": "SinaLab Resources", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "language modeling, Tokenization, named entity recognition", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "Mustafa Jarrar, Nagham Hamad, Mohammed Khalilia, Bashar Talafha, AbdelRahim Elmadany, Muhammad Abdul-Mageed", + "Affiliations": "nan", + "Abstract": "We present WojoodNER-2024, the second Arabic Named Entity Recognition (NER) Shared Task. In WojoodNER-2024, we focus on fine-grained Arabic NER. We provided participants with a new Arabic fine-grained NER dataset called wojoodfine, annotated with subtypes of entities. WojoodNER-2024 encompassed three subtasks: (i) Closed-Track Flat Fine-Grained NER, (ii) Closed-Track Nested Fine-Grained NER, and (iii) an Open-Track NER for the Israeli War on Gaza. A total of 43 unique teams registered for this shared task. Five teams participated in the Flat Fine-Grained Subtask, among which two teams tackled the Nested Fine-Grained Subtask and one team participated in the Open-Track NER Subtask. The winning teams achieved F-1 scores of 91% and 92% in the Flat Fine-Grained and Nested Fine-Grained Subtasks, respectively. 
The sole team in the Open-Track Subtask achieved an F-1 score of 73.7%.", + "Added By": "Tymaa Hammouda" +} \ No newline at end of file diff --git a/datasets/wojoodhadath.json b/datasets/wojoodhadath.json new file mode 100644 index 0000000..68bef17 --- /dev/null +++ b/datasets/wojoodhadath.json @@ -0,0 +1,36 @@ +{ + "Name": "WojoodHadath", + "Subsets": [], + "HF Link": "nan", + "Link": "https://sina.birzeit.edu/relations/", + "License": "CC BY 4.0", + "Year": 2024, + "Language": "ar", + "Dialect": "mixed", + "Domain": "news articles", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "Extends the Wojood dataset by incorporating relations into Wojood's nested structure. The added relations include hasAgent, hasLocation, and hasDate. The dataset, provided in JSON format, consists of sentences, each containing one or more events along with their corresponding arguments. It is divided into three subsets: training, validation, and test.", + "Volume": "550,000", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "SinaLab, Birzeit University", + "Derived From": "nan", + "Paper Title": "Event-Arguments Extraction Corpus and Modeling using BERT for Arabic", + "Paper Link": "https://arxiv.org/abs/2407.21153", + "Script": "Arab-Latn", + "Tokenized": "Yes", + "Host": "SinaLab Resources", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "language modeling, Tokenization, named entity recognition, natural language inference", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "Alaa Aljabari, Lina Duaibes, Mustafa Jarrar, Mohammed Khalilia", + "Affiliations": "nan", + "Abstract": "Event-argument extraction is a challenging task, particularly in Arabic due to sparse linguistic resources. To fill this gap, we introduce the Hadath corpus (550k tokens) as an extension of Wojood, enriched with event-argument annotations. We used three types of event arguments: agent, location, and date, which we annotated as relation types. Our inter-annotator agreement evaluation resulted in 82.23% Kappa score and 87.2% F1-score. Additionally, we propose a novel method for event relation extraction using BERT, in which we treat the task as text entailment. This method achieves an F1-score of 94.01%. To further evaluate the generalization of our proposed method, we collected and annotated another out-of-domain corpus (about 80k tokens) called TestNLI and used it as a second test set, on which our approach achieved promising results (83.59% F1-score). Last but not least, we propose an end-to-end system for event-arguments extraction. This system is implemented as part of SinaTools, and both corpora are publicly available at: https://sina.birzeit.edu/wojood/", + "Added By": "Tymaa Hammouda" +} \ No newline at end of file diff --git a/datasets/wsd.json b/datasets/wsd.json new file mode 100644 index 0000000..c369e06 --- /dev/null +++ b/datasets/wsd.json @@ -0,0 +1,36 @@ +{ + "Name": "WSD", + "Subsets": [], + "HF Link": "nan", + "Link": "https://data.mendeley.com/datasets/pmdbs9tby8/1", + "License": "CC BY-NC 4.0", + "Year": 2024, + "Language": "ar", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "crawling and annotation(other)", + "Description": "A dataset for Arabic Word Sense Disambiguation (WSD) consisting of 3,670 labeled examples of 100 polysemous Arabic words. 
It provides multiple senses for each word, annotated with real-world and GPT-generated sentences.", + "Volume": "3,670", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "Zayed University", + "Derived From": "nan", + "Paper Title": "A comprehensive dataset for Arabic word sense disambiguation", + "Paper Link": "https://www.sciencedirect.com/science/article/pii/S2352340924005584", + "Script": "Arab", + "Tokenized": "No", + "Host": "Mendeley Data", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "machine translation, sentiment analysis, word sense disambiguation, text classification, spam detection", + "Venue Title": "Data in Brief", + "Citations": "nan", + "Venue Type": "journal", + "Venue Name": "Data in Brief", + "Authors": "Sanaa Kaddoura, Reem Nassar", + "Affiliations": "Zayed University, UAE", + "Abstract": "This data paper introduces a comprehensive dataset tailored for word sense disambiguation tasks, explicitly focusing on a hundred polysemous words frequently employed in Modern Standard Arabic. The dataset encompasses 367 unique senses, each accompanied by contextual sentences comprising ten sentence examples that feature the polysemous word in various contexts. The dataset was meticulously collected from various web sources and supplemented with synthetic sentences generated by GPT3.5-turbo, addressing instances where rare senses lacked sufficient real-world data. 
The dataset provides a valuable resource for Arabic Natural Language Processing (NLP) tasks and is freely available for use in research.", + "Added By": "Maryam Al Emadi" +} \ No newline at end of file diff --git a/datasets/x-csr.json b/datasets/x-csr.json new file mode 100644 index 0000000..d428d05 --- /dev/null +++ b/datasets/x-csr.json @@ -0,0 +1,36 @@ +{ + "Name": "X-CSR", + "Subsets": [], + "HF Link": "https://hf.co/datasets/INK-USC/xcsr", + "Link": "https://inklab.usc.edu//XCSR/xcsr_datasets", + "License": "unknown", + "Year": 2021, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "machine translation", + "Description": "automatically translate the original CSQA and CODAH datasets, which only have English versions, to 15 other languages, forming development and test sets for studying X-CSR", + "Volume": "1,300", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "University of Southern California", + "Derived From": "nan", + "Paper Title": "Common Sense Beyond English: Evaluating and Improving Multilingual Language Models for Commonsense Reasoning", + "Paper Link": "https://arxiv.org/pdf/2106.06937.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Upon-Request", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "commonsense reasoning", + "Venue Title": "arXiv", + "Citations": "nan", + "Venue Type": "preprint", + "Venue Name": "nan", + "Authors": "Bill Yuchen Lin, Seyeon Lee, Xiaoyang Qiao, Xiang Ren\n", + "Affiliations": "Department of Computer Science and Information Sciences Institute, University of Southern California", + "Abstract": "Commonsense reasoning research has so far\nbeen mainly limited to English. We aim\nto evaluate and improve popular multilingual\nlanguage models (ML-LMs) to help advance\ncommonsense reasoning (CSR) beyond English. We collect the Mickey corpus, consisting of 561k sentences in 11 different languages, which can be used for analyzing and\nimproving ML-LMs. We propose Mickey\nProbe, a language-agnostic probing task for\nfairly evaluating the common sense of popular ML-LMs across different languages. Also,\nwe create two new datasets, X-CSQA and XCODAH, by translating their English versions\nto 15 other languages, so that we can evaluate\npopular ML-LMs for cross-lingual commonsense reasoning. To improve the performance\nbeyond English, we propose a simple yet effective method \u2014 multilingual contrastive pretraining (MCP). 
It significantly enhances sentence representations, yielding a large performance gain on both benchmarks (e.g., +2.7%\naccuracy for X-CSQA over XLM-RL).", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/xcsr.json b/datasets/xcsr.json new file mode 100644 index 0000000..4b17e98 --- /dev/null +++ b/datasets/xcsr.json @@ -0,0 +1,49 @@ +{ + "Name": "xcsr", + "Subsets": [ + { + "Name": "X-CSQA", + "Dialect": "ar-CLS: (Arabic (Classic))", + "Volume": "2,074", + "Unit": "sentences" + }, + { + "Name": "X-CODAH", + "Dialect": "ar-CLS: (Arabic (Classic))", + "Volume": "1,300", + "Unit": "sentences" + } + ], + "HF Link": "https://hf.co/datasets/xcsr", + "Link": "https://hf.co/datasets/xcsr", + "License": "unknown", + "Year": 2021, + "Language": "multilingual", + "Dialect": "ar-CLS: (Arabic (Classic))", + "Domain": "other", + "Form": "text", + "Collection Style": "human translation", + "Description": "To evaluate multi-lingual language models (ML-LMs) for commonsense reasoning in a cross-lingual zero-shot transfer setting (X-CSR)", + "Volume": "3,374", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "University of Southern California", + "Derived From": "nan", + "Paper Title": "Common Sense Beyond English: Evaluating and Improving Multilingual Language Models for Commonsense Reasoning", + "Paper Link": "https://aclanthology.org/2021.acl-long.102.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "HuggingFace", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "commonsense reasoning", + "Venue Title": "ACL", + "Citations": "13.0", + "Venue Type": "conference", + "Venue Name": "Association for Computational Linguistics", + "Authors": "Bill Yuchen Lin, Seyeon Lee, Xiaoyang Qiao, Xiang Ren", + "Affiliations": "University of Southern California", + "Abstract": "Commonsense reasoning research has so far been limited to English. We aim to evaluate and improve popular multilingual language models (ML-LMs) to help advance commonsense reasoning (CSR) beyond English. We collect the Mickey corpus, consisting of 561k sentences in 11 different languages, which can be used for analyzing and improving ML-LMs. We propose Mickey Probe, a language-general probing task for fairly evaluating the common sense of popular ML-LMs across different languages. In addition, we also create two new datasets, X-CSQA and X-CODAH, by translating their English versions to 14 other languages, so that we can evaluate popular ML-LMs for cross-lingual commonsense reasoning. To improve the performance beyond English, we propose a simple yet effective method \u2014 multilingual contrastive pretraining (MCP). It significantly enhances sentence representations, yielding a large performance gain on both benchmarks (e.g., +2.7% accuracy for X-CSQA over XLM-R_L).", + "Added By": "Khalid N. 
Elmadani" +} \ No newline at end of file diff --git a/datasets/xglue.json b/datasets/xglue.json new file mode 100644 index 0000000..ebcccd1 --- /dev/null +++ b/datasets/xglue.json @@ -0,0 +1,36 @@ +{ + "Name": "XGLUE", + "Subsets": [], + "HF Link": "https://hf.co/datasets/microsoft/xglue", + "Link": "https://github.com/microsoft/XGLUE", + "License": "CC BY 4.0", + "Year": 2020, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "XGLUE is a new benchmark dataset to evaluate the performance of cross-lingual pre-trained models with respect to cross-lingual natural language understanding and generation. The training data of each task is in English while the validation and test data is present in multiple different languages. The following table shows which languages are present as validation and test data for each config.", + "Volume": "10,000", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "Microsoft", + "Derived From": "Universal Dependencies, MLQA, XNLI", + "Paper Title": "XGLUE: A New Benchmark Dataset for Cross-lingual Pre-training, Understanding and Generation", + "Paper Link": "https://arxiv.org/pdf/2004.01401.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "part of speech tagging, question answering, natural language inference", + "Venue Title": "arXiv", + "Citations": "nan", + "Venue Type": "preprint", + "Venue Name": "nan", + "Authors": "Yaobo Liang, Nan Duan, Yeyun Gong, Ning Wu, Fenfei Guo, Weizhen Qi, Ming Gong, Linjun Shou,Daxin Jiang, Guihong Cao, Xiaodong Fan, Ruofei Zhang, Rahul Agrawal, Edward Cui, Sining Wei, Taroon Bharti,Ying Qiao, Jiun-Hung Chen, Winnie Wu, Shuguang Liu, Fan Yang, Daniel Campos, Rangan Majumder, Ming Zho", + "Affiliations": "microsoft ", + "Abstract": "\nIn this paper, we introduce XGLUE, a new\nbenchmark dataset that can be used to train\nlarge-scale cross-lingual pre-trained models\nusing multilingual and bilingual corpora and\nevaluate their performance across a diverse set\nof cross-lingual tasks. Comparing to GLUE\n(Wang et al., 2019), which is labeled in English for natural language understanding tasks\nonly, XGLUE has two main advantages: (1)\nit provides 11 diversified tasks that cover both\nnatural language understanding and generation\nscenarios; (2) for each task, it provides labeled\ndata in multiple languages. We extend a recent cross-lingual pre-trained model Unicoder\n(Huang et al., 2019) to cover both understanding and generation tasks, which is evaluated on\nXGLUE as a strong baseline. We also evaluate the base versions (12-layer) of Multilingual\nBERT, XLM and XLM-R for comparison", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/xl-headtags.json b/datasets/xl-headtags.json new file mode 100644 index 0000000..6d61da1 --- /dev/null +++ b/datasets/xl-headtags.json @@ -0,0 +1,36 @@ +{ + "Name": "XL-HeadTags", + "Subsets": [], + "HF Link": "https://hf.co/datasets/faisaltareque/XL-HeadTags", + "Link": "https://hf.co/datasets/faisaltareque/XL-HeadTags", + "License": "CC BY-SA 4.0", + "Year": 2024, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "news articles", + "Form": "text", + "Collection Style": "crawling", + "Description": "We provide XL-HeadTags, a large-scale news headline and tags generation dataset. 
The dataset consists of 20 languages across six diverse language families. It contains 415K news headline-article pairs with auxiliary information such as image captions and topic words", + "Volume": "6,922", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "XL-HeadTags: Leveraging Multimodal Retrieval Augmentation for the Multilingual Generation of News Headlines and Tags", + "Paper Link": "https://aclanthology.org/2024.findings-acl.771.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "HuggingFace", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "headline generation", + "Venue Title": "ACL Findings", + "Citations": "nan", + "Venue Type": "conference", + "Venue Name": "Findings of the Association for Computational Linguistics", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "Millions of news articles published online daily\ncan overwhelm readers. Headlines and entity\n(topic) tags are essential for guiding readers to\ndecide if the content is worth their time. While\nheadline generation has been extensively studied, tag generation remains largely unexplored,\nyet it offers readers better access to topics of\ninterest. The need for conciseness in capturing\nreaders\u2019 attention necessitates improved content selection strategies for identifying salient\nand relevant segments within lengthy articles,\nthereby guiding language models effectively.\nTo address this, we propose to leverage auxiliary information such as images and captions\nembedded in the articles to retrieve relevant sentences and utilize instruction tuning with variations to generate both headlines and tags for\nnews articles in a multilingual context. To make\nuse of the auxiliary information, we have compiled a dataset named XL-HeadTags, which\nincludes 20 languages across 6 diverse language families. Through extensive evaluation,\nwe demonstrate the effectiveness of our plugand-play multimodal-multilingual retrievers for\nboth tasks. Additionally, we have developed a\nsuite of tools for processing and evaluating multilingual texts, significantly contributing to the\nresearch community by enabling more accurate\nand efficient analysis across languages", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/xlel_wd.json b/datasets/xlel_wd.json new file mode 100644 index 0000000..fba70c1 --- /dev/null +++ b/datasets/xlel_wd.json @@ -0,0 +1,36 @@ +{ + "Name": "xlel_wd", + "Subsets": [], + "HF Link": "https://hf.co/datasets/adithya7/xlel_wd", + "Link": "https://hf.co/datasets/adithya7/xlel_wd", + "License": "CC BY 4.0", + "Year": 2022, + "Language": "multilingual", + "Dialect": "ar-CLS: (Arabic (Classic))", + "Domain": "wikipedia", + "Form": "text", + "Collection Style": "crawling", + "Description": "XLEL-WD is a multilingual event linking dataset. This dataset contains mention references in multilingual Wikipedia/Wikinews articles to event items from Wikidata. 
The descriptions for Wikidata event items are taken from the corresponding Wikipedia articles.", + "Volume": "10,947", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "Carnegie Mellon University", + "Derived From": "nan", + "Paper Title": "Multilingual Event Linking to Wikidata", + "Paper Link": "https://arxiv.org/pdf/2204.06535.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "HuggingFace", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "multilingual linking, cross-lingual linking", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "Adithya Pratapa, Rishubh Gupta, Teruko Mitamura", + "Affiliations": "Carnegie Mellon University", + "Abstract": "We present a task of multilingual linking of events to a knowledge base. We automatically compile a large-scale dataset for this task, comprising of 1.8M mentions across 44 languages referring to over 10.9K events from Wikidata. We propose two variants of the event linking task: 1) multilingual, where event descriptions are from the same language as the mention, and 2) crosslingual, where all event descriptions are in English. On the two proposed tasks, we compare multiple event linking systems including BM25+ (Lv and Zhai, 2011) and multilingual adaptations of the biencoder and crossencoder architectures from BLINK (Wu et al., 2020). In our experiments on the two task variants, we find both biencoder and crossencoder models significantly outperform the BM25+ baseline. Our results also indicate that the crosslingual task is in general more challenging than the multilingual task. To test the out-of-domain generalization of the proposed linking systems, we additionally create a Wikinews-based evaluation set. We present qualitative analysis highlighting various aspects captured by the proposed dataset, including the need for temporal reasoning over context and tackling diverse event descriptions across languages.", + "Added By": "Khalid N. Elmadani" +} \ No newline at end of file diff --git a/datasets/xlel_wd_dictionary.json b/datasets/xlel_wd_dictionary.json new file mode 100644 index 0000000..2811fe7 --- /dev/null +++ b/datasets/xlel_wd_dictionary.json @@ -0,0 +1,36 @@ +{ + "Name": "xlel_wd_dictionary", + "Subsets": [], + "HF Link": "https://hf.co/datasets/adithya7/xlel_wd_dictionary", + "Link": "https://hf.co/datasets/adithya7/xlel_wd_dictionary", + "License": "CC BY 4.0", + "Year": 2022, + "Language": "multilingual", + "Dialect": "ar-CLS: (Arabic (Classic))", + "Domain": "wikipedia", + "Form": "text", + "Collection Style": "crawling", + "Description": "XLEL-WD is a multilingual event linking dataset. This supplementary dataset contains a dictionary of event items from Wikidata. 
The descriptions for Wikidata event items are taken from the corresponding multilingual Wikipedia articles.", + "Volume": "114,834", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "Carnegie Mellon University", + "Derived From": "nan", + "Paper Title": "Multilingual Event Linking to Wikidata", + "Paper Link": "https://arxiv.org/pdf/2204.06535.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "HuggingFace", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "event linking", + "Venue Title": "nan", + "Citations": "nan", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "Adithya Pratapa, Rishubh Gupta, Teruko Mitamura", + "Affiliations": "Carnegie Mellon University", + "Abstract": "We present a task of multilingual linking of events to a knowledge base. We automatically compile a large-scale dataset for this task, comprising of 1.8M mentions across 44 languages referring to over 10.9K events from Wikidata. We propose two variants of the event linking task: 1) multilingual, where event descriptions are from the same language as the mention, and 2) crosslingual, where all event descriptions are in English. On the two proposed tasks, we compare multiple event linking systems including BM25+ (Lv and Zhai, 2011) and multilingual adaptations of the biencoder and crossencoder architectures from BLINK (Wu et al., 2020). In our experiments on the two task variants, we find both biencoder and crossencoder models significantly outperform the BM25+ baseline. Our results also indicate that the crosslingual task is in general more challenging than the multilingual task. To test the out-of-domain generalization of the proposed linking systems, we additionally create a Wikinews-based evaluation set. We present qualitative analysis highlighting various aspects captured by the proposed dataset, including the need for temporal reasoning over context and tackling diverse event descriptions across languages.", + "Added By": "Khalid N. Elmadani" +} \ No newline at end of file
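A minimal sketch of how the two XLEL-WD records above fit together, assuming the HuggingFace datasets library; neither repo's config names are assumed here, so they are listed first.

from datasets import get_dataset_config_names, load_dataset  # pip install datasets

# The mention data and the event dictionary live in two HF repos;
# list the configs each one defines before loading anything.
print(get_dataset_config_names("adithya7/xlel_wd"))
print(get_dataset_config_names("adithya7/xlel_wd_dictionary"))
# A join would then map each mention's event identifier to its dictionary
# entry; the exact field names depend on the repos' actual schema.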
diff --git a/datasets/xnli.json b/datasets/xnli.json new file mode 100644 index 0000000..ca3c5ff --- /dev/null +++ b/datasets/xnli.json @@ -0,0 +1,36 @@ +{ + "Name": "XNLI", + "Subsets": [], + "HF Link": "https://hf.co/datasets/facebook/xnli", + "Link": "https://github.com/facebookresearch/XNLI", + "License": "CC BY-NC 4.0", + "Year": 2018, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "human translation", + "Description": "An evaluation set for XLU, created by extending the development and test sets of the Multi-Genre Natural Language Inference Corpus (MultiNLI) to 15 languages.", + "Volume": "7,500", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "Facebook, NYU", + "Derived From": "nan", + "Paper Title": "XNLI: Evaluating Cross-lingual Sentence Representations", + "Paper Link": "https://arxiv.org/pdf/1809.05053.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "natural language inference", + "Venue Title": "EMNLP", + "Citations": "354.0", + "Venue Type": "conference", + "Venue Name": "Conference on Empirical Methods in Natural Language Processing", + "Authors": "Alexis Conneau, Guillaume Lample, Ruty Rinott, Adina Williams, Samuel R. Bowman, Holger Schwenk, Veselin Stoyanov", + "Affiliations": ",,,Facebook AI Research,New York University,,", + "Abstract": "State-of-the-art natural language processing systems rely on supervision in the form of annotated data to learn competent models. These models are generally trained on data in a single language (usually English), and cannot be directly used beyond that language. Since collecting data in every language is not realistic, there has been a growing interest in cross-lingual language understanding (XLU) and low-resource cross-language transfer. In this work, we construct an evaluation set for XLU by extending the development and test sets of the Multi-Genre Natural Language Inference Corpus (MultiNLI) to 14 languages, including low-resource languages such as Swahili and Urdu. We hope that our dataset, dubbed XNLI, will catalyze research in cross-lingual sentence understanding by providing an informative standard evaluation task. In addition, we provide several baselines for multilingual sentence understanding, including two based on machine translation systems, and two that use parallel data to train aligned multilingual bag-of-words and LSTM encoders. We find that XNLI represents a practical and challenging evaluation suite, and that directly translating the test data yields the best performance among available baselines.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file
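A minimal sketch of loading the Arabic portion of the XNLI entry above, assuming the HuggingFace datasets library and the per-language config "ar" described on the dataset card.

from datasets import load_dataset  # pip install datasets

# Each XNLI example is a premise/hypothesis pair with a 3-way label.
xnli_ar = load_dataset("facebook/xnli", "ar")
example = xnli_ar["test"][0]
print(example["premise"])
print(example["hypothesis"])
print(example["label"])  # 0 = entailment, 1 = neutral, 2 = contradiction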
diff --git a/datasets/xor-tydi_qa.json b/datasets/xor-tydi_qa.json new file mode 100644 index 0000000..8d19cd8 --- /dev/null +++ b/datasets/xor-tydi_qa.json @@ -0,0 +1,36 @@ +{ + "Name": "XOR-TyDi QA", + "Subsets": [], + "HF Link": "https://hf.co/datasets/akariasai/xor_tydi_qa", + "Link": "https://nlp.cs.washington.edu/xorqa/index.html", + "License": "CC BY-SA 4.0", + "Year": 2021, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "other", + "Description": "XOR-TyDi QA brings together for the first time information-seeking questions, open-retrieval QA, and multilingual QA to create a multilingual open-retrieval QA dataset that enables cross-lingual answer retrieval. It consists of questions written by information-seeking native speakers in 7 typologically diverse languages and answer annotations that are retrieved from multilingual document collections.", + "Volume": "5,235", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "Multiple Institutions", + "Derived From": "TYDIQA", + "Paper Title": "XOR QA: Cross-lingual Open-Retrieval Question Answering", + "Paper Link": "https://arxiv.org/pdf/2010.11856.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "other", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "open-retrieval question answering", + "Venue Title": "arXiv", + "Citations": "nan", + "Venue Type": "preprint", + "Venue Name": "nan", + "Authors": "Akari Asai, Jungo Kasai, Jonathan H. Clark, Kenton Lee, Eunsol Choi, Hannaneh Hajishirzi", + "Affiliations": "University of Washington, University of Washington, Google Research, The University of Texas at Austin; Allen Institute for AI", + "Abstract": "Multilingual question answering tasks typically assume that answers exist in the same\nlanguage as the question. Yet in practice, many languages face both information\nscarcity\u2014where languages have few reference\narticles\u2014and information asymmetry\u2014where\nquestions reference concepts from other cultures. This work extends open-retrieval question answering to a cross-lingual setting enabling questions from one language to be answered via answer content from another language. We construct a large-scale dataset\nbuilt on 40K information-seeking questions\nacross 7 diverse non-English languages that\nTYDI QA could not find same-language answers for. Based on this dataset, we introduce\na task framework, called Cross-lingual OpenRetrieval Question Answering (XOR QA),\nthat consists of three new tasks involving crosslingual document retrieval from multilingual\nand English resources. We establish baselines\nwith state-of-the-art machine translation systems and cross-lingual pretrained models. Experimental results suggest that XOR QA is a\nchallenging task that will facilitate the development of novel techniques for multilingual\nquestion answering. Our data and code are\navailable at https://nlp.cs.washington.\nedu/xorqa/.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file
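A minimal sketch of pulling the Arabic questions out of the XOR-TyDi QA record above; the config name "xor-full" and the "lang" and "question" field names are assumptions based on the dataset card, not verified here.

from datasets import load_dataset  # pip install datasets

# Load the dataset and keep only the Arabic questions.
xor = load_dataset("akariasai/xor_tydi_qa", "xor-full")  # config name assumed
arabic = xor["validation"].filter(lambda ex: ex["lang"] == "ar")
print(len(arabic), arabic[0]["question"])  # field names assumed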
diff --git a/datasets/xp3all.json b/datasets/xp3all.json new file mode 100644 index 0000000..73698e9 --- /dev/null +++ b/datasets/xp3all.json @@ -0,0 +1,36 @@ +{ + "Name": "xp3all", + "Subsets": [], + "HF Link": "https://hf.co/datasets/bigscience/xP3all", + "Link": "https://hf.co/datasets/bigscience/xP3all", + "License": "Apache-2.0", + "Year": 2022, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "machine translation", + "Description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts and datasets across 46 languages and 16 NLP tasks. It is used to train BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.", + "Volume": "2,610,000", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "BigScience", + "Derived From": "nan", + "Paper Title": "Crosslingual Generalization through Multitask Finetuning", + "Paper Link": "https://aclanthology.org/2023.acl-long.891.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "HuggingFace", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "instruction tuning", + "Venue Title": "ACL", + "Citations": "nan", + "Venue Type": "conference", + "Venue Name": "Association for Computational Linguistics", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "Multitask prompted finetuning (MTF) has been\nshown to help large language models generalize to new tasks in a zero-shot setting, but\nso far explorations of MTF have focused on\nEnglish data and models. We apply MTF to\nthe pretrained multilingual BLOOM and mT5\nmodel families to produce finetuned variants\ncalled BLOOMZ and mT0. We find finetuning\nlarge multilingual language models on English\ntasks with English prompts allows for task generalization to non-English languages that appear only in the pretraining corpus. Finetuning on multilingual tasks with English prompts\nfurther improves performance on English and\nnon-English tasks leading to various state-ofthe-art zero-shot results. We also investigate\nfinetuning on multilingual tasks with prompts\nthat have been machine-translated from English to match the language of each dataset.\nWe find training on these machine-translated\nprompts leads to better performance on humanwritten prompts in the respective languages.\nSurprisingly, we find models are capable of\nzero-shot generalization to tasks in languages\nthey have never intentionally seen. We conjecture that the models are learning higher-level\ncapabilities that are both task- and languageagnostic. In addition, we introduce xP3, a\ncomposite of supervised datasets in 46 languages with English and machine-translated\nprompts. Our code, datasets and models\nare freely available at https://github.com/\nbigscience-workshop/xmtf", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file diff --git a/datasets/xquad.json b/datasets/xquad.json new file mode 100644 index 0000000..a05e8f3 --- /dev/null +++ b/datasets/xquad.json @@ -0,0 +1,36 @@ +{ + "Name": "xquad", + "Subsets": [], + "HF Link": "https://hf.co/datasets/google/xquad", + "Link": "https://github.com/deepmind/xquad", + "License": "CC BY-SA 4.0", + "Year": 2019, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "human translation", + "Description": "A benchmark dataset for evaluating cross-lingual question answering performance. The dataset consists of a subset of 240 paragraphs and 1190 question-answer pairs from the development set of SQuAD v1.1 (Rajpurkar et al., 2016) together with their professional translations into ten languages: Spanish, German, Greek, Russian, Turkish, Arabic, Vietnamese, Thai, Chinese, and Hindi. Consequently, the dataset is entirely parallel across 11 languages.", + "Volume": "1,190", + "Unit": "documents", + "Ethical Risks": "Low", + "Provider": "DeepMind", + "Derived From": "SQuAD", + "Paper Title": "On the Cross-lingual Transferability of Monolingual Representations", + "Paper Link": "https://aclanthology.org/2020.acl-main.421.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "question answering", + "Venue Title": "ACL", + "Citations": "nan", + "Venue Type": "conference", + "Venue Name": "Association for Computational Linguistics", + "Authors": "Mikel Artetxe, Sebastian Ruder, Dani Yogatama", + "Affiliations": "HiTZ Center, University of the Basque Country; DeepMind, DeepMind", + "Abstract": "State-of-the-art unsupervised multilingual\nmodels (e.g., multilingual BERT) have been\nshown to generalize in a zero-shot crosslingual setting. This generalization ability has\nbeen attributed to the use of a shared subword\nvocabulary and joint training across multiple\nlanguages giving rise to deep multilingual\nabstractions. We evaluate this hypothesis by\ndesigning an alternative approach that transfers a monolingual model to new languages\nat the lexical level. More concretely, we first\ntrain a transformer-based masked language\nmodel on one language, and transfer it to a\nnew language by learning a new embedding\nmatrix with the same masked language\nmodeling objective\u2014freezing parameters\nof all other layers. This approach does not\nrely on a shared vocabulary or joint training.\nHowever, we show that it is competitive with\nmultilingual BERT on standard cross-lingual\nclassification benchmarks and on a new\nCross-lingual Question Answering Dataset\n(XQuAD). Our results contradict common\nbeliefs of the basis of the generalization ability\nof multilingual models and suggest that deep\nmonolingual models learn some abstractions\nthat generalize across languages. 
We also\nrelease XQuAD as a more comprehensive\ncross-lingual benchmark, which comprises\n240 paragraphs and 1190 question-answer\npairs from SQuAD v1.1 translated into ten\nlanguages by professional translators.", + "Added By": "Zaid Alyafeai" +} \ No newline at end of file
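A minimal sketch of loading the Arabic slice of the XQuAD entry above, assuming the HuggingFace datasets library and the per-language config naming ("xquad.ar") used on the dataset card.

from datasets import load_dataset  # pip install datasets

# XQuAD ships only a validation split (it is an evaluation set).
xquad_ar = load_dataset("google/xquad", "xquad.ar")
example = xquad_ar["validation"][0]
print(example["question"])
print(example["answers"])  # SQuAD-style answer spans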
diff --git a/datasets/xquad_r.json b/datasets/xquad_r.json new file mode 100644 index 0000000..a6dea37 --- /dev/null +++ b/datasets/xquad_r.json @@ -0,0 +1,36 @@ +{ + "Name": "xquad_r", + "Subsets": [], + "HF Link": "https://hf.co/datasets/xquad_r", + "Link": "https://github.com/google-research-datasets/lareqa", + "License": "CC BY 4.0", + "Year": 2020, + "Language": "multilingual", + "Dialect": "mixed", + "Domain": "other", + "Form": "text", + "Collection Style": "machine translation", + "Description": "XQuAD-R is a retrieval version of the XQuAD dataset (a cross-lingual extractive QA dataset). Like XQuAD, XQuAD-R is an 11-way parallel dataset, where each question appears in 11 different languages and has 11 parallel correct answers across the languages.", + "Volume": "1,190", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "Google Research", + "Derived From": "XQuAD dataset", + "Paper Title": "LAReQA: Language-agnostic answer retrieval from a multilingual pool", + "Paper Link": "https://arxiv.org/pdf/2004.05484.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "No", + "Tasks": "language-agnostic answer retrieval from a multilingual candidate pool", + "Venue Title": "nan", + "Citations": "21.0", + "Venue Type": "nan", + "Venue Name": "nan", + "Authors": "Uma Roy, Noah Constant, Rami Al-Rfou, Aditya Barua, Aaron Phillips, Yinfei Yang", + "Affiliations": "Google Research", + "Abstract": "We present LAReQA, a challenging new benchmark for language-agnostic answer retrieval from a multilingual candidate pool. Unlike previous cross-lingual tasks, LAReQA tests for \u201cstrong\u201d cross-lingual alignment, requiring semantically related cross-language pairs to be closer in representation space than unrelated same-language pairs. Building on multilingual BERT (mBERT), we study different strategies for achieving strong alignment. We find that augmenting training data via machine translation is effective, and improves significantly over using mBERT out-of-the-box. Interestingly, the embedding baseline that performs the best on LAReQA falls short of competing baselines on zero-shot variants of our task that only target \u201cweak\u201d alignment. This finding underscores our claim that language agnostic retrieval is a substantively new kind of cross-lingual evaluation.", + "Added By": "Wafaa Mohammed" +} \ No newline at end of file diff --git a/datasets/xsid_-_(x)_slot_and_intent_detection.json b/datasets/xsid_-_(x)_slot_and_intent_detection.json new file mode 100644 index 0000000..c16bc40 --- /dev/null +++ b/datasets/xsid_-_(x)_slot_and_intent_detection.json @@ -0,0 +1,36 @@ +{ + "Name": "xSID - (X) Slot and Intent Detection", + "Subsets": [], + "HF Link": "https://hf.co/datasets/SEACrowd/xsid", + "Link": "https://github.com/mainlp/xsid", + "License": "CC BY-SA 4.0", + "Year": 2021, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "human translation", + "Description": "An evaluation dataset for intent classification and slot detection.", + "Volume": "800", + "Unit": "tokens", + "Ethical Risks": "Low", + "Provider": "nan", + "Derived From": "nan", + "Paper Title": "From Masked Language Modeling to Translation: Non-English Auxiliary Tasks Improve Zero-shot Spoken Language Understanding", + "Paper Link": "https://aclanthology.org/2021.naacl-main.197.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "intent classification, slot detection", + "Venue Title": "NAACL", + "Citations": "nan", + "Venue Type": "conference", + "Venue Name": "North American Chapter of the Association for Computational Linguistics", + "Authors": "nan", + "Affiliations": "nan", + "Abstract": "The lack of publicly available evaluation data for low-resource languages limits progress in Spoken Language Understanding (SLU). As key tasks like intent classification and slot filling require abundant training data, it is desirable to reuse existing data in high-resource languages to develop models for low-resource scenarios. We introduce XSID, a new benchmark for cross-lingual (X) Slot and Intent Detection in 13 languages from 6 language families, including a very low-resource dialect. To tackle the challenge, we propose a joint learning approach, with English SLU training data and non-English auxiliary tasks from raw text, syntax and translation for transfer. We study two setups which differ by type and language coverage of the pre-trained embeddings. Our results show that jointly learning the main tasks with masked language modeling is effective for slots, while machine translation transfer works best for intent classification", + "Added By": "Amr Keleg" +} \ No newline at end of file
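A toy sketch of the two evaluations the xSID entry above targets, intent accuracy and a simplified token-level slot F1; the data and field layout here are illustrative only, not the dataset's real format (proper slot scoring is span-level).

# One (intent, BIO slot tags) pair per utterance: gold vs. predicted.
gold = [("PlayMusic", ["O", "B-artist", "I-artist"])]
pred = [("PlayMusic", ["O", "B-artist", "O"])]

# Intent accuracy: exact label match per utterance.
intent_acc = sum(g[0] == p[0] for g, p in zip(gold, pred)) / len(gold)

# Simplified slot F1 over non-"O" tags.
tp = sum(gt == pt != "O" for (_, gts), (_, pts) in zip(gold, pred)
         for gt, pt in zip(gts, pts))
gold_n = sum(t != "O" for _, ts in gold for t in ts)
pred_n = sum(t != "O" for _, ts in pred for t in ts)
precision = tp / pred_n if pred_n else 0.0
recall = tp / gold_n if gold_n else 0.0
f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
print(intent_acc, f1)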
", + "Volume": "nan", + "Unit": "sentences", + "Ethical Risks": "Low", + "Provider": "Google", + "Derived From": "Tydiqa, xnli, etc.", + "Paper Title": "XTREME: A Massively Multilingual Multi-task Benchmark\nfor Evaluating Cross-lingual Generalization\n", + "Paper Link": "https://arxiv.org/pdf/2003.11080.pdf", + "Script": "Arab", + "Tokenized": "No", + "Host": "GitHub", + "Access": "Free", + "Cost": "nan", + "Test Split": "Yes", + "Tasks": "natural language inference,part of speech tagging,named entity recognition,question answering,machine translation", + "Venue Title": "ICML", + "Citations": "209.0", + "Venue Type": "conference", + "Venue Name": "International Conference on Machine Learning ", + "Authors": "Junjie Hu,Sebastian Ruder,Aditya Siddhant,Graham Neubig,Orhan Firat,M. Johnson", + "Affiliations": ",DeepMind,,,,", + "Abstract": "Much recent progress in applications of machine learning models to NLP has been driven by benchmarks that evaluate models across a wide variety of tasks. However, these broad-coverage benchmarks have been mostly limited to English, and despite an increasing interest in multilingual models, a benchmark that enables the comprehensive evaluation of such methods on a diverse range of languages and tasks is still missing. To this end, we introduce the Cross-lingual TRansfer Evaluation of Multilingual Encoders XTREME benchmark, a multi-task benchmark for evaluating the cross-lingual generalization capabilities of multilingual representations across 40 languages and 9 tasks. We demonstrate that while models tested on English reach human performance on many tasks, there is still a sizable gap in the performance of cross-lingually transferred models, particularly on syntactic and sentence retrieval tasks. There is also a wide spread of results across languages. 
diff --git a/datasets/zaebuc.json b/datasets/zaebuc.json new file mode 100644 index 0000000..3d8e92a --- /dev/null +++ b/datasets/zaebuc.json @@ -0,0 +1,36 @@ +{ + "Name": "ZAEBUC", + "Subsets": [], + "HF Link": "nan", + "Link": "https://sites.google.com/view/zaebuc/home", + "License": "CC BY-NC 4.0", + "Year": 2022, + "Language": "multilingual", + "Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))", + "Domain": "other", + "Form": "text", + "Collection Style": "manual curation", + "Description": "An annotated Arabic-English bilingual writer corpus comprising short essays by first-year university students at Zayed University in the United Arab Emirates.", + "Volume": "33,300", + "Unit": "tokens", + "Ethical Risks": "Medium", + "Provider": "NYU Abu Dhabi, Zayed University, UAE", + "Derived From": "nan", + "Paper Title": "ZAEBUC: An Annotated Arabic-English Bilingual Writer Corpus", + "Paper Link": "http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.9.pdf", + "Script": "Arab-Latn", + "Tokenized": "Yes", + "Host": "other", + "Access": "Upon-Request", + "Cost": "nan", + "Test Split": "No", + "Tasks": "language modeling, language identification, morphological analysis, text error analysis", + "Venue Title": "LREC", + "Citations": "nan", + "Venue Type": "conference", + "Venue Name": "International Conference on Language Resources and Evaluation", + "Authors": "Nizar Habash, David Palfreyman", + "Affiliations": "New York University Abu Dhabi, Abu Dhabi, UAE; Zayed University, Abu Dhabi, UAE", + "Abstract": "We present ZAEBUC, an annotated Arabic-English bilingual writer corpus comprising short essays by first-year university students at Zayed University in the United Arab Emirates. We describe and discuss the various guidelines and pipeline processes we followed to create the annotations and quality check them. The annotations include spelling and grammar correction, morphological tokenization, Part-of-Speech tagging, lemmatization, and Common European Framework of Reference (CEFR) ratings. All of the annotations are done on Arabic and English texts using consistent guidelines as much as possible, with tracked alignments among the different annotations, and to the original raw texts. For morphological tokenization, POS tagging, and lemmatization, we use existing automatic annotation tools followed by manual correction. We also present various measurements and correlations with preliminary insights drawn from the data and annotations. The publicly available ZAEBUC corpus and its annotations are intended to be the stepping stones for additional annotations.", + "Added By": "Jezia Zakraoui" +} \ No newline at end of file
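Since this patch splits the catalog into one JSON file per dataset, a quick sanity check such as the following sketch can validate the new files; the required-field list is inferred from the entries in this diff, not from a formal schema.

import glob
import json

# Fields every entry in this patch carries (inferred from the diff).
REQUIRED = {
    "Name", "HF Link", "Link", "License", "Year", "Language", "Dialect",
    "Domain", "Form", "Collection Style", "Description", "Volume", "Unit",
    "Tasks", "Added By",
}

for path in sorted(glob.glob("datasets/*.json")):
    with open(path, encoding="utf-8") as handle:
        record = json.load(handle)  # also catches malformed JSON
    missing = REQUIRED - record.keys()
    if missing:
        print(f"{path}: missing fields {sorted(missing)}")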