diff --git a/.gitignore b/.gitignore index 31afbff..6e5f8b6 100644 --- a/.gitignore +++ b/.gitignore @@ -129,3 +129,7 @@ dmypy.json # Pyre type checker .pyre/ +.DS_Store + +dumped* +.vscode/ \ No newline at end of file diff --git a/README.md b/README.md index 99ad3d0..ad8507b 100644 --- a/README.md +++ b/README.md @@ -39,6 +39,7 @@ This script translate the samsum dataset using the inference server python main.py ``` + ## Translate ```shell @@ -47,3 +48,11 @@ python -m instructmultilingual.translate \ --source_language="English" \ --target_language="Egyptian Arabic" ``` + + +### Automatic Data Generation + +Run the following script. +``` +bash scripts/validate_and_generate.sh +``` diff --git a/data/MT5_2_NLLB_ONE_TO_MANY.json b/data/MT5_2_NLLB_ONE_TO_MANY.json new file mode 100644 index 0000000..8dda6c4 --- /dev/null +++ b/data/MT5_2_NLLB_ONE_TO_MANY.json @@ -0,0 +1,670 @@ +{ + "af": [ + [ + "afr_Latn", + "Afrikaans" + ] + ], + "sq": [ + [ + "als_Latn", + "Tosk Albanian" + ] + ], + "am": [ + [ + "amh_Ethi", + "Amharic" + ] + ], + "ar": [ + [ + "ace_Arab", + "Acehnese (Arabic script)" + ], + [ + "acm_Arab", + "Mesopotamian Arabic" + ], + [ + "acq_Arab", + "Ta\u2019izzi-Adeni Arabic" + ], + [ + "aeb_Arab", + "Tunisian Arabic" + ], + [ + "ajp_Arab", + "South Levantine Arabic" + ], + [ + "apc_Arab", + "North Levantine Arabic" + ], + [ + "arb_Arab", + "Modern Standard Arabic" + ], + [ + "arb_Latn", + "Modern Standard Arabic (Romanized)" + ], + [ + "ars_Arab", + "Najdi Arabic" + ], + [ + "ary_Arab", + "Moroccan Arabic" + ], + [ + "arz_Arab", + "Egyptian Arabic" + ], + [ + "bjn_Arab", + "Banjar (Arabic script)" + ], + [ + "kas_Arab", + "Kashmiri (Arabic script)" + ], + [ + "knc_Arab", + "Central Kanuri (Arabic script)" + ], + [ + "min_Arab", + "Minangkabau (Arabic script)" + ] + ], + "hy": [ + [ + "hye_Armn", + "Armenian" + ] + ], + "az": [ + [ + "azb_Arab", + "South Azerbaijani" + ], + [ + "azj_Latn", + "North Azerbaijani" + ] + ], + "eu": [ + [ + "eus_Latn", + "Basque" 
+ ] + ], + "be": [ + [ + "bel_Cyrl", + "Belarusian" + ] + ], + "bn": [ + [ + "ben_Beng", + "Bengali" + ], + [ + "mni_Beng", + "Meitei (Bengali script)" + ] + ], + "bg": [ + [ + "bul_Cyrl", + "Bulgarian" + ] + ], + "my": [ + [ + "mya_Mymr", + "Burmese" + ] + ], + "ca": [ + [ + "cat_Latn", + "Catalan" + ] + ], + "ceb": [ + [ + "ceb_Latn", + "Cebuano" + ] + ], + "zh": [ + [ + "yue_Hant", + "Yue Chinese" + ], + [ + "zho_Hans", + "Chinese (Simplified)" + ], + [ + "zho_Hant", + "Chinese (Traditional)" + ] + ], + "cs": [ + [ + "ces_Latn", + "Czech" + ] + ], + "da": [ + [ + "dan_Latn", + "Danish" + ] + ], + "nl": [ + [ + "nld_Latn", + "Dutch" + ] + ], + "en": [ + [ + "eng_Latn", + "English" + ] + ], + "eo": [ + [ + "epo_Latn", + "Esperanto" + ] + ], + "et": [ + [ + "est_Latn", + "Estonian" + ] + ], + "fi": [ + [ + "fin_Latn", + "Finnish" + ] + ], + "fr": [ + [ + "fra_Latn", + "French" + ] + ], + "gl": [ + [ + "glg_Latn", + "Galician" + ] + ], + "ka": [ + [ + "kat_Geor", + "Georgian" + ] + ], + "de": [ + [ + "deu_Latn", + "German" + ] + ], + "el": [ + [ + "ell_Grek", + "Greek" + ] + ], + "gu": [ + [ + "guj_Gujr", + "Gujarati" + ] + ], + "ht": [ + [ + "hat_Latn", + "Haitian Creole" + ] + ], + "ha": [ + [ + "hau_Latn", + "Hausa" + ] + ], + "iw": [ + [ + "heb_Hebr", + "Hebrew" + ] + ], + "hi": [ + [ + "hin_Deva", + "Hindi" + ] + ], + "hu": [ + [ + "hun_Latn", + "Hungarian" + ] + ], + "is": [ + [ + "isl_Latn", + "Icelandic" + ] + ], + "ig": [ + [ + "ibo_Latn", + "Igbo" + ] + ], + "id": [ + [ + "ind_Latn", + "Indonesian" + ] + ], + "ga": [ + [ + "gle_Latn", + "Irish" + ] + ], + "it": [ + [ + "ita_Latn", + "Italian" + ] + ], + "ja": [ + [ + "jpn_Jpan", + "Japanese" + ] + ], + "jv": [ + [ + "jav_Latn", + "Javanese" + ] + ], + "kn": [ + [ + "kan_Knda", + "Kannada" + ] + ], + "kk": [ + [ + "kaz_Cyrl", + "Kazakh" + ] + ], + "km": [ + [ + "khm_Khmr", + "Khmer" + ] + ], + "ko": [ + [ + "kor_Hang", + "Korean" + ] + ], + "ku": [ + [ + "ckb_Arab", + "Central Kurdish" + ], + [ + 
"kmr_Latn", + "Northern Kurdish" + ] + ], + "ky": [ + [ + "kir_Cyrl", + "Kyrgyz" + ] + ], + "lo": [ + [ + "lao_Laoo", + "Lao" + ] + ], + "la": [ + [ + "ace_Latn", + "Acehnese (Latin script)" + ], + [ + "bjn_Latn", + "Banjar (Latin script)" + ], + [ + "knc_Latn", + "Central Kanuri (Latin script)" + ], + [ + "min_Latn", + "Minangkabau (Latin script)" + ], + [ + "taq_Latn", + "Tamasheq (Latin script)" + ] + ], + "lv": [ + [ + "lvs_Latn", + "Standard Latvian" + ] + ], + "lt": [ + [ + "lit_Latn", + "Lithuanian" + ] + ], + "lb": [ + [ + "ltz_Latn", + "Luxembourgish" + ] + ], + "mk": [ + [ + "mkd_Cyrl", + "Macedonian" + ] + ], + "mg": [ + [ + "plt_Latn", + "Plateau Malagasy" + ] + ], + "ms": [ + [ + "mal_Mlym", + "Malayalam" + ], + [ + "zsm_Latn", + "Standard Malay" + ] + ], + "ml": [ + [ + "mal_Mlym", + "Malayalam" + ] + ], + "mt": [ + [ + "mlt_Latn", + "Maltese" + ] + ], + "mi": [ + [ + "mri_Latn", + "Maori" + ] + ], + "mr": [ + [ + "mar_Deva", + "Marathi" + ] + ], + "mn": [ + [ + "khk_Cyrl", + "Halh Mongolian" + ] + ], + "ne": [ + [ + "npi_Deva", + "Nepali" + ] + ], + "no": [ + [ + "nno_Latn", + "Norwegian Nynorsk" + ], + [ + "nob_Latn", + "Norwegian Bokm\u00e5l" + ] + ], + "ps": [ + [ + "pbt_Arab", + "Southern Pashto" + ] + ], + "fa": [ + [ + "pes_Arab", + "Western Persian" + ] + ], + "pl": [ + [ + "pol_Latn", + "Polish" + ] + ], + "pt": [ + [ + "por_Latn", + "Portuguese" + ] + ], + "ro": [ + [ + "ron_Latn", + "Romanian" + ] + ], + "ru": [ + [ + "rus_Cyrl", + "Russian" + ] + ], + "sm": [ + [ + "smo_Latn", + "Samoan" + ] + ], + "gd": [ + [ + "gla_Latn", + "Scottish Gaelic" + ] + ], + "sr": [ + [ + "srp_Cyrl", + "Serbian" + ] + ], + "sn": [ + [ + "sna_Latn", + "Shona" + ] + ], + "sd": [ + [ + "snd_Arab", + "Sindhi" + ] + ], + "si": [ + [ + "sin_Sinh", + "Sinhala" + ] + ], + "sk": [ + [ + "slk_Latn", + "Slovak" + ] + ], + "sl": [ + [ + "slv_Latn", + "Slovenian" + ] + ], + "so": [ + [ + "som_Latn", + "Somali" + ] + ], + "st": [ + [ + "nso_Latn", + "Northern Sotho" + ], + 
[ + "sot_Latn", + "Southern Sotho" + ] + ], + "es": [ + [ + "spa_Latn", + "Spanish" + ] + ], + "su": [ + [ + "sun_Latn", + "Sundanese" + ] + ], + "sw": [ + [ + "swh_Latn", + "Swahili" + ] + ], + "sv": [ + [ + "swe_Latn", + "Swedish" + ] + ], + "tg": [ + [ + "tgk_Cyrl", + "Tajik" + ] + ], + "ta": [ + [ + "tam_Taml", + "Tamil" + ] + ], + "te": [ + [ + "tel_Telu", + "Telugu" + ] + ], + "th": [ + [ + "tha_Thai", + "Thai" + ] + ], + "tr": [ + [ + "tur_Latn", + "Turkish" + ] + ], + "uk": [ + [ + "ukr_Cyrl", + "Ukrainian" + ] + ], + "ur": [ + [ + "urd_Arab", + "Urdu" + ] + ], + "uz": [ + [ + "uzn_Latn", + "Northern Uzbek" + ] + ], + "vi": [ + [ + "vie_Latn", + "Vietnamese" + ] + ], + "cy": [ + [ + "cym_Latn", + "Welsh" + ] + ], + "xh": [ + [ + "xho_Latn", + "Xhosa" + ] + ], + "yi": [ + [ + "ydd_Hebr", + "Eastern Yiddish" + ] + ], + "yo": [ + [ + "yor_Latn", + "Yoruba" + ] + ], + "zu": [ + [ + "zul_Latn", + "Zulu" + ] + ] +} \ No newline at end of file diff --git a/data/MT5_2_NLLB_ONE_TO_ONE.json b/data/MT5_2_NLLB_ONE_TO_ONE.json new file mode 100644 index 0000000..75cf7c9 --- /dev/null +++ b/data/MT5_2_NLLB_ONE_TO_ONE.json @@ -0,0 +1,95 @@ +{ + "af": "afr_Latn", + "sq": "als_Latn", + "am": "amh_Ethi", + "hy": "hye_Armn", + "eu": "eus_Latn", + "be": "bel_Cyrl", + "bg": "bul_Cyrl", + "my": "mya_Mymr", + "ca": "cat_Latn", + "ceb": "ceb_Latn", + "cs": "ces_Latn", + "da": "dan_Latn", + "nl": "nld_Latn", + "en": "eng_Latn", + "eo": "epo_Latn", + "et": "est_Latn", + "fi": "fin_Latn", + "fr": "fra_Latn", + "gl": "glg_Latn", + "ka": "kat_Geor", + "de": "deu_Latn", + "el": "ell_Grek", + "gu": "guj_Gujr", + "ht": "hat_Latn", + "ha": "hau_Latn", + "iw": "heb_Hebr", + "hi": "hin_Deva", + "hu": "hun_Latn", + "is": "isl_Latn", + "ig": "ibo_Latn", + "id": "ind_Latn", + "ga": "gle_Latn", + "it": "ita_Latn", + "ja": "jpn_Jpan", + "jv": "jav_Latn", + "kn": "kan_Knda", + "kk": "kaz_Cyrl", + "km": "khm_Khmr", + "ko": "kor_Hang", + "ky": "kir_Cyrl", + "lo": "lao_Laoo", + "lv": "lvs_Latn", + 
"lt": "lit_Latn", + "lb": "ltz_Latn", + "mk": "mkd_Cyrl", + "mg": "plt_Latn", + "ml": "mal_Mlym", + "mt": "mlt_Latn", + "mi": "mri_Latn", + "mr": "mar_Deva", + "mn": "khk_Cyrl", + "ne": "npi_Deva", + "ps": "pbt_Arab", + "fa": "pes_Arab", + "pl": "pol_Latn", + "pt": "por_Latn", + "ro": "ron_Latn", + "ru": "rus_Cyrl", + "sm": "smo_Latn", + "gd": "gla_Latn", + "sr": "srp_Cyrl", + "sn": "sna_Latn", + "sd": "snd_Arab", + "si": "sin_Sinh", + "sk": "slk_Latn", + "sl": "slv_Latn", + "so": "som_Latn", + "es": "spa_Latn", + "su": "sun_Latn", + "sw": "swh_Latn", + "sv": "swe_Latn", + "tg": "tgk_Cyrl", + "ta": "tam_Taml", + "te": "tel_Telu", + "th": "tha_Thai", + "tr": "tur_Latn", + "uk": "ukr_Cyrl", + "ur": "urd_Arab", + "uz": "uzn_Latn", + "vi": "vie_Latn", + "cy": "cym_Latn", + "xh": "xho_Latn", + "yi": "ydd_Hebr", + "yo": "yor_Latn", + "zu": "zul_Latn", + "ar": "arb_Arab", + "bn": "ben_Beng", + "zh": "zho_Hans", + "fil": "fil_Latn", + "hr": "hrv_Latn", + "he": "heb_Hebr", + "ms": "msa_Latn", + "no": "nno_Latn" +} \ No newline at end of file diff --git a/data/data_stat.py b/data/data_stat.py new file mode 100644 index 0000000..a7e3662 --- /dev/null +++ b/data/data_stat.py @@ -0,0 +1,397 @@ +import os +import csv +import json +import datasets +import argparse + +# huggingface dataset signature with configs +SERIES_A_DATASET_NAME_DICT = { + "udhr": { + # None: "mixed" + }, + "AmazonScience/mintaka": { + "ar": "ar", + "de": "de", + "en": "en", + "es": "es", + "fr": "fr", + "hi": "hi", + "it": "it", + "ja": "ja", + "pt": "pt", + }, + "xcsr": { + "X-CSQA-en": "en", + "X-CSQA-zh": "zh", + "X-CSQA-de": "de", + "X-CSQA-es": "es", + "X-CSQA-fr": "fr", + "X-CSQA-it": "it", + "X-CSQA-jap": "ja", + "X-CSQA-nl": "nl", + "X-CSQA-pl": "pl", + "X-CSQA-pt": "pt", + "X-CSQA-ru": "ru", + "X-CSQA-ar": "ar", + "X-CSQA-vi": "vi", + "X-CSQA-hi": "hi", + "X-CSQA-sw": "sw", + "X-CSQA-ur": "ur", + # 'X-CODAH-en': "en", + # 'X-CODAH-zh': "zh", + # 'X-CODAH-de': "de", + # 'X-CODAH-es': "es", + # 
'X-CODAH-fr': "fr", + # 'X-CODAH-it': "it", + # 'X-CODAH-jap': "ja", + # 'X-CODAH-nl': "nl", + # 'X-CODAH-pl': "pl", + # 'X-CODAH-pt': "pt", + # 'X-CODAH-ru': "ru", + # 'X-CODAH-ar': "ar", + # 'X-CODAH-vi': "vi", + # 'X-CODAH-hi': "hi", + # 'X-CODAH-sw': "sw", + # 'X-CODAH-ur': "ur", + }, + "shmuhammad/AfriSenti-twitter-sentiment": { + "amh": "amh", + "hau": "hau", + "ibo": "ibo", + "arq": "arq", + "ary": "ary", + # 'yor':'yor', + "por": "por", + "twi": "twi", + "tso": "tso", + "tir": "tir", + "pcm": "pcm", + "kin": "kin", + "swa": "swa", + # 'orm': 'orm', + }, + "indonlp/NusaX-senti": { + "ace": "ace", + "ban": "ban", + "bjn": "bjn", + # 'bug':'bug', + "eng": "eng", + "ind": "ind", + # 'jav':'jav', + "mad": "mad", + "min": "min", + "nij": "nij", + "sun": "sun", + "bbc": "bbc", + }, + "masakhane/masakhanews": { + "amh": "amh", + "eng": "eng", + "fra": "fra", + "hau": "hau", + "ibo": "ibo", + "lin": "lin", + "lug": "lug", + "orm": "orm", + "pcm": "pcm", + "run": "run", + "sna": "sna", + "som": "som", + "swa": "swa", + "tir": "tir", + "xho": "xho", + "yor": "yor", + }, + "papluca/language-identification": { + # None: "mixed", + }, + "adithya7/xlel_wd": { + # 'wikipedia-zero-shot': "mixed", + # 'wikinews-zero-shot': "mixed", + # 'wikinews-cross-domain': "mixed", + "wikipedia-zero-shot.af": "af", + "wikipedia-zero-shot.ar": "ar", + "wikipedia-zero-shot.be": "be", + "wikipedia-zero-shot.bg": "bg", + "wikipedia-zero-shot.bn": "bn", + "wikipedia-zero-shot.ca": "ca", + "wikipedia-zero-shot.cs": "cs", + "wikipedia-zero-shot.da": "da", + "wikipedia-zero-shot.de": "de", + "wikipedia-zero-shot.el": "el", + "wikipedia-zero-shot.en": "en", + "wikipedia-zero-shot.es": "es", + "wikipedia-zero-shot.fa": "fa", + "wikipedia-zero-shot.fi": "fi", + "wikipedia-zero-shot.fr": "fr", + "wikipedia-zero-shot.he": "he", + "wikipedia-zero-shot.hi": "hi", + "wikipedia-zero-shot.hu": "hu", + "wikipedia-zero-shot.id": "id", + "wikipedia-zero-shot.it": "it", + "wikipedia-zero-shot.ja": "ja", + 
"wikipedia-zero-shot.ko": "ko", + "wikipedia-zero-shot.ml": "ml", + "wikipedia-zero-shot.mr": "mr", + "wikipedia-zero-shot.ms": "ms", + "wikipedia-zero-shot.nl": "nl", + "wikipedia-zero-shot.no": "no", + "wikipedia-zero-shot.pl": "pl", + "wikipedia-zero-shot.pt": "pt", + "wikipedia-zero-shot.ro": "ro", + "wikipedia-zero-shot.ru": "ru", + "wikipedia-zero-shot.si": "si", + "wikipedia-zero-shot.sk": "sk", + "wikipedia-zero-shot.sl": "sl", + "wikipedia-zero-shot.sr": "sr", + "wikipedia-zero-shot.sv": "sv", + "wikipedia-zero-shot.sw": "sw", + "wikipedia-zero-shot.ta": "ta", + "wikipedia-zero-shot.te": "te", + "wikipedia-zero-shot.th": "th", + "wikipedia-zero-shot.tr": "tr", + "wikipedia-zero-shot.uk": "uk", + "wikipedia-zero-shot.vi": "vi", + "wikipedia-zero-shot.zh": "zh", + "wikinews-zero-shot.ar": "ar", + "wikinews-zero-shot.cs": "cs", + "wikinews-zero-shot.de": "de", + "wikinews-zero-shot.en": "en", + "wikinews-zero-shot.es": "es", + "wikinews-zero-shot.fi": "fi", + "wikinews-zero-shot.fr": "fr", + "wikinews-zero-shot.it": "it", + "wikinews-zero-shot.ja": "ja", + "wikinews-zero-shot.ko": "ko", + "wikinews-zero-shot.nl": "nl", + "wikinews-zero-shot.no": "no", + "wikinews-zero-shot.pl": "pl", + "wikinews-zero-shot.pt": "pt", + "wikinews-zero-shot.ru": "ru", + "wikinews-zero-shot.sr": "sr", + "wikinews-zero-shot.sv": "sv", + "wikinews-zero-shot.ta": "ta", + # 'wikinews-zero-shot.tr': 'tr', + "wikinews-zero-shot.uk": "uk", + "wikinews-zero-shot.zh": "zh", + "wikinews-cross-domain.ar": "ar", + "wikinews-cross-domain.bg": "bg", + "wikinews-cross-domain.ca": "ca", + "wikinews-cross-domain.cs": "cs", + "wikinews-cross-domain.de": "de", + "wikinews-cross-domain.el": "el", + "wikinews-cross-domain.en": "en", + "wikinews-cross-domain.es": "es", + "wikinews-cross-domain.fi": "fi", + "wikinews-cross-domain.fr": "fr", + "wikinews-cross-domain.he": "he", + "wikinews-cross-domain.hu": "hu", + "wikinews-cross-domain.it": "it", + "wikinews-cross-domain.ja": "ja", + 
"wikinews-cross-domain.ko": "ko", + "wikinews-cross-domain.nl": "nl", + "wikinews-cross-domain.no": "no", + "wikinews-cross-domain.pl": "pl", + "wikinews-cross-domain.pt": "pt", + "wikinews-cross-domain.ro": "ro", + "wikinews-cross-domain.ru": "ru", + "wikinews-cross-domain.sr": "sr", + "wikinews-cross-domain.sv": "sv", + "wikinews-cross-domain.ta": "ta", + "wikinews-cross-domain.tr": "tr", + "wikinews-cross-domain.uk": "uk", + "wikinews-cross-domain.zh": "zh", + }, + "sbmaruf/forai_ml-ted_talk_iwslt": { + "eu_ca_2014": "eu_ca", + "eu_ca_2015": "eu_ca", + "eu_ca_2016": "eu_ca", + "nl_en_2014": "nl_en", + "nl_en_2015": "nl_en", + "nl_en_2016": "nl_en", + "nl_hi_2014": "nl_hi", + "nl_hi_2015": "nl_hi", + "nl_hi_2016": "nl_hi", + "de_ja_2014": "de_ja", + "de_ja_2015": "de_ja", + "de_ja_2016": "de_ja", + "fr-ca_hi_2014": "fr_hi", + "fr-ca_hi_2015": "fr_hi", + "fr-ca_hi_2016": "fr_hi", + }, + "sbmaruf/forai_ml_masakhane_mafand": { + "en-amh": "en-amh", + "en-hau": "en-hau", + "en-ibo": "en-ibo", + "en-kin": "en-kin", + "en-lug": "en-lug", + "en-nya": "en-nya", + "en-pcm": "en-pcm", + "en-sna": "en-sna", + "en-swa": "en-swa", + "en-tsn": "en-tsn", + "en-twi": "en-twi", + "en-xho": "en-xho", + "en-yor": "en-yor", + "en-zul": "en-zul", + "fr-bam": "fr-bam", + "fr-bbj": "fr-bbj", + "fr-ewe": "fr-ewe", + "fr-fon": "fr-fon", + "fr-mos": "fr-mos", + "fr-wol": "fr-wol", + }, + "exams": { + # 'alignments': 'mixed', + # 'multilingual': 'mixed', + # 'multilingual_with_para': 'mixed', + # 'crosslingual_test':'mixed', + # 'crosslingual_with_para_test': 'mixed', + "crosslingual_bg": "bg", + "crosslingual_with_para_bg": "bg", + "crosslingual_hr": "hr", + "crosslingual_with_para_hr": "hr", + "crosslingual_hu": "hu", + "crosslingual_with_para_hu": "hu", + "crosslingual_it": "it", + "crosslingual_with_para_it": "it", + "crosslingual_mk": "mk", + "crosslingual_with_para_mk": "mk", + "crosslingual_pl": "pl", + "crosslingual_with_para_pl": "pl", + "crosslingual_pt": "pt", + 
"crosslingual_with_para_pt": "pt", + "crosslingual_sq": "sq", + "crosslingual_with_para_sq": "sq", + "crosslingual_sr": "sr", + "crosslingual_with_para_sr": "sr", + "crosslingual_tr": "tr", + "crosslingual_with_para_tr": "tr", + "crosslingual_vi": "vi", + "crosslingual_with_para_vi": "vi", + }, + "allenai/soda": { + None: "en", + }, + "arabic_billion_words": { + # 'Alittihad': "ar", + "Almasryalyoum": "ar", + "Almustaqbal": "ar", + "Alqabas": "ar", + "Echoroukonline": "ar", + "Ryiadh": "ar", + "Sabanews": "ar", + "SaudiYoumSaudi": "ar", + "Techreen": "ar", + "Youm7": "ar", + }, + "theblackcat102/joke_explaination": { + None: "en", + }, + "narrativeqa": { + None: "en", + }, + "svakulenk0/qrecc": { + None: "en", + }, + "GEM/wiki_cat_sum": { + "animal": "en", + "company": "en", + "film": "en", + }, + "allenai/scirepeval": { + # "fos": "en", + # "mesh_descriptors": "en", + # "cite_count": "en", + # "pub_year": "en", + # "cite_prediction": "en", + # "cite_prediction_new": "en", + # "high_influence_cite": "en", + # "same_author": "en", + # "search": "en", + "biomimicry": "en", + # "drsm": "en", + # "feeds_1": "en", + # "feeds_m": "en", + # "feeds_title": "en", + # "peer_review_score_hIndex": "en", + # "trec_covid": "en", + # "tweet_mentions": "en", + # "scidocs_mag_mesh": "en", + # "scidocs_view_cite_read": "en", + # "paper_reviewer_matching": "en", + }, + "TurkuNLP/turku_paraphrase_corpus": { + "plain": "fi", + "plain-context": "fi", + "classification": "fi", + "classification-context": "fi", + "generation": "fi", + }, + "wiki_split": {None: "en"}, +} + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--dataset-names", + nargs="+", + default=None, + help="Print the stat of the dataset. 
If `None` it will print stat of all the used data.", + ) + parser.add_argument( + "--export-format", + choices=["json", "csv"], + default=".json", + help="Which format you want to export.", + ) + parser.add_argument( + "--output-dir", + default=None, + help="The path to the folder where stat will be saved.", + ) + args = parser.parse_args() + stat_dict = {} + if args.dataset_names is None: + args.dataset_names = list(SERIES_A_DATASET_NAME_DICT.keys()) + for dataset_name, subset_dict in SERIES_A_DATASET_NAME_DICT.items(): + if dataset_name not in args.dataset_names: + continue + assert dataset_name not in stat_dict + stat_dict[dataset_name] = {} + for subset, subset_lang in subset_dict.items(): + assert subset not in stat_dict[dataset_name] + stat_dict[dataset_name][subset] = {} + dt = datasets.load_dataset( + dataset_name, name=subset, verification_mode="no_checks" + ) + for split in dt.keys(): + stat_dict[dataset_name][subset][split] = { + "size": len(dt[split]), + "column": list(dt[split].column_names), + } + # re-valuation of hypothesis considered in prompt template + if subset is not None and "X-CSQA" in subset: + for sample in dt[split]: + assert len(sample["question"]["choices"]["label"]) == 5 + + # if args.output_dir != "None": + # file_name = os.path.join(args.output_dir, "stat") + f".{args.export_format}" + # if args.export_format == "json": + # with open(file_name, "w") as file_ptr: + # file_ptr.write(f"{json.dumps(stat_dict, indent=4)}\n") + # elif args.export_format == "csv": + # # with open(file_name, mode='w') as file_ptr: + # # writer = csv.writer(file_ptr) + # # for dataset_name, subset_name, in SERIES_A_DATASET_NAME_DICT.keys(): + # # row = [f"{dataset_name}"] + + # # writer.writerow(stat_dict.values()) + # pass + # else: + # raise NotImplementedError + + +if __name__ == "__main__": + main() diff --git a/data/validate_and_generate.py b/data/validate_and_generate.py new file mode 100644 index 0000000..0d99c89 --- /dev/null +++ 
b/data/validate_and_generate.py @@ -0,0 +1,573 @@ +import os +import csv +import copy +import json +import tqdm +import argparse +import datasets +import subprocess +from datetime import date +import concurrent.futures +from typing import Tuple, Optional, List +from promptsource.templates import Template +from data.data_stat import SERIES_A_DATASET_NAME_DICT + +datasets.logging.set_verbosity_error() + +# mT5 101 language mapper with it's native symbol mentioned in the paper. +MT5_LANGS_NAME_PAIR = [ + ("Afrikaans", "af"), + ("Albanian", "sq"), + ("Amharic", "am"), + ("Arabic", "ar"), + ("Armenian", "hy"), + ("Azerbaijani", "az"), + ("Basque", "eu"), + ("Belarusian", "be"), + ("Bengali", "bn"), + ("Bulgarian", "bg"), + ("Burmese", "my"), + ("Catalan", "ca"), + ("Cebuano", "ceb"), + ("Chichewa", "ny"), + ("Chinese", "zh"), + ("Chinese (Traditional)", "zh"), + ("Corsican", "co"), + ("Czech", "cs"), + ("Danish", "da"), + ("Dutch", "nl"), + ("English", "en"), + ("Esperanto", "eo"), + ("Estonian", "et"), + ("Filipino", "fil"), + ("Finnish", "fi"), + ("French", "fr"), + ("Galician", "gl"), + ("Georgian", "ka"), + ("German", "de"), + ("Greek", "el"), + ("Gujarati", "gu"), + ("Haitian Creole", "ht"), + ("Hausa", "ha"), + ("Hawaiian", "haw"), + ("Hebrew", "iw"), + ("Hindi", "hi"), + ("Hmong", "hmn"), + ("Hungarian", "hu"), + ("Icelandic", "is"), + ("Igbo", "ig"), + ("Indonesian", "id"), + ("Irish", "ga"), + ("Italian", "it"), + ("Japanese", "ja"), + ("Javanese", "jv"), + ("Kannada", "kn"), + ("Kazakh", "kk"), + ("Khmer", "km"), + ("Korean", "ko"), + ("Kurdish", "ku"), + ("Kyrgyz", "ky"), + ("Lao", "lo"), + ("Latin", "la"), + ("Latvian", "lv"), + ("Lithuanian", "lt"), + ("Luxembourgish", "lb"), + ("Macedonian", "mk"), + ("Malagasy", "mg"), + ("Malay", "ms"), + ("Malayalam", "ml"), + ("Maltese", "mt"), + ("Maori", "mi"), + ("Marathi", "mr"), + ("Mongolian", "mn"), + ("Nepali", "ne"), + ("Norwegian", "no"), + ("Pashto", "ps"), + ("Persian", "fa"), + ("Polish", "pl"), + 
("Portuguese", "pt"), + ("Punjabi", "pa"), + ("Romanian", "ro"), + ("Russian", "ru"), + ("Samoan", "sm"), + ("Scottish Gaelic", "gd"), + ("Serbian", "sr"), + ("Shona", "sn"), + ("Sindhi", "sd"), + ("Sinhala", "si"), + ("Slovak", "sk"), + ("Slovenian", "sl"), + ("Somali", "so"), + ("Sotho", "st"), + ("Spanish", "es"), + ("Sundanese", "su"), + ("Swahili", "sw"), + ("Swedish", "sv"), + ("Tajik", "tg"), + ("Tamil", "ta"), + ("Telugu", "te"), + ("Thai", "th"), + ("Turkish", "tr"), + ("Ukrainian", "uk"), + ("Urdu", "ur"), + ("Uzbek", "uz"), + ("Vietnamese", "vi"), + ("Welsh", "cy"), + ("West Frisian", "fy"), + ("Xhosa", "xh"), + ("Yiddish", "yi"), + ("Yoruba", "yo"), + ("Zulu", "zu"), +] +MT5_LANGS_FULL_NAME_TO_ISO_NAME = { + full_name: iso_name for full_name, iso_name in MT5_LANGS_NAME_PAIR +} +MT5_2_NLLB_ONE_TO_ONE = json.load(open("data/MT5_2_NLLB_ONE_TO_ONE.json")) + +# dataset mapper from spreadsheet (downloaded by --form_path argument) to huggingface dataset signature. +DATASET_MAPPER = { + "AfriSenti-twitter-sentiment https://huggingface.co/datasets/shmuhammad/AfriSenti-twitter-sentiment": "shmuhammad/AfriSenti-twitter-sentiment", + "Joke-explanation https://huggingface.co/datasets/theblackcat102/joke_explaination": "theblackcat102/joke_explaination", + "Language Identification https://huggingface.co/datasets/papluca/language-identification": "papluca/language-identification", + "Mafand - a machine translation task https://huggingface.co/datasets/masakhane/mafand": "sbmaruf/forai_ml_masakhane_mafand", + "Masakhanews https://github.com/masakhane-io/masakhane-news": "masakhane/masakhanews", + "Mintaka https://huggingface.co/datasets/AmazonScience/mintaka": "AmazonScience/mintaka", + "NarrativeQA https://huggingface.co/datasets/narrativeqa": "narrativeqa", + "NusaX - sentiment classification https://huggingface.co/datasets/indonlp/NusaX-senti": "indonlp/NusaX-senti", + "qrecc https://huggingface.co/datasets/svakulenk0/qrecc": "svakulenk0/qrecc", + "SODA 
def check(
    json_example: str,
    template_name: str,
    jinja_template: str,
    template_reference: Optional[str] = None,
    original_task: Optional[str] = None,
    choices_in_prompt: Optional[bool] = None,
    metrics: Optional[List[str]] = None,
    languages: Optional[List[str]] = None,
    answer_choices: Optional[str] = None,
) -> Tuple[str, str]:
    """
    Render a single dataset sample through a prompt template.

    The sample arrives as a JSON string, is decoded with `json.loads()`, and is
    then projected with a `Template` built from the remaining arguments.

    Args:
        json_example (str): JSON-encoded sample, typically one row of a
                    huggingface dataset serialised with `json.dumps()`.
        template_name: unique name (per dataset) for the template.
        jinja_template: the template body, written in Jinja.
        template_reference: author or paper reference for the template.
        original_task: whether the prompt asks the model to perform the task
                    the dataset was originally designed for.
                    NOTE(review): annotated as `Optional[str]` but described as
                    a True/False flag - confirm the intended type.
        choices_in_prompt: whether the answer choices are shown in the rendered
                    input. Only applicable to classification tasks.
        metrics: names of the metrics to use for evaluation.
        languages: languages used in the prompt itself (not the dataset's).
        answer_choices: Jinja expression producing a ||| delimited string of
                    the possible completions; `None` means the template is
                    open-ended. Accessible inside Jinja as `answer_choices`.

    Returns:
        The value returned by `Template.apply()` for the decoded sample,
        annotated here as an (inputs, targets) pair of strings.
    """
    sample = json.loads(json_example)
    prompt = Template(
        template_name,
        jinja_template,
        template_reference,
        metadata=Template.Metadata(
            original_task, choices_in_prompt, metrics, languages
        ),
        answer_choices=answer_choices,
    )
    # Render without variable highlighting and without truncation.
    return prompt.apply(sample, highlight_variables=False, truncate=False)