remove nan values

ARBML · Dec 21, 2024 · c861093 · c861093
1 parent dd11107
commit c861093
Show file tree

Hide file tree

Showing 709 changed files with 4,857 additions and 4,857 deletions.
diff --git a/.gitignore b/.gitignore
@@ -7,3 +7,4 @@ vendor
 .pytest_cache
 .hypothesis
 Gemfile.lock
+save_jsons.py
diff --git a/assets/js/search.js b/assets/js/search.js
@@ -22,12 +22,12 @@ function accessBadge(text) {
 function setAttributes(attribute, element) {
 
   if (attribute == "Cost") {
-    if (element[attribute] != "nan") return element[attribute]
+    if (element[attribute] != "") return element[attribute]
     else return "0$"
   } else if (attribute == "Access") {
     return accessBadge(element[attribute])
   }
-  else if (element[attribute] != "nan") {
+  else if (element[attribute] != "") {
     return element[attribute]
   }
 

diff --git a/data.json b/data.json
diff --git a/datasets/101_billion_arabic_words_dataset.json b/datasets/101_billion_arabic_words_dataset.json
@@ -22,13 +22,13 @@
     "Tokenized": "No",
     "Host": "HuggingFace",
     "Access": "Free",
-    "Cost": "nan",
+    "Cost": "",
     "Test Split": "No",
     "Tasks": "text generation, language modeling",
     "Venue Title": "arXiv",
-    "Citations": "nan",
+    "Citations": "",
     "Venue Type": "preprint",
-    "Venue Name": "nan",
+    "Venue Name": "",
     "Authors": "Manel Aloui, Hasna Chouikhi, Ghaith Chaabane, Haithem Kchaou, and Chehir Dhaouadi",
     "Affiliations": "Clusterlab",
     "Abstract": "In recent years, Large Language Models (LLMs) have revolutionized the field of natural language processing, showcasing an impressive rise predominantly in English-centric domains. These advancements have set a global benchmark, inspiring significant efforts toward developing Arabic LLMs capable of understanding and generating the Arabic language with remarkable accuracy. Despite these advancements, a critical challenge persists: the potential bias in Arabic LLMs, primarily attributed to their reliance on datasets comprising English data that has been translated into Arabic. This reliance not only compromises the authenticity of the generated content but also reflects a broader issue\u2014the scarcity of original quality Arabic linguistic data. This study aims to address the data scarcity in the Arab world and to encourage the development of Arabic Language Models that are true to both the linguistic and nuances of the region. We undertook a large-scale data mining project, extracting a substantial volume of text from the Common Crawl WET files, specifically targeting Arabic content. The extracted data underwent a rigorous cleaning and deduplication process, using innovative techniques to ensure the integrity and uniqueness of the dataset. The result is the 101 Billion Arabic Words Dataset, the largest Arabic dataset available to date, which can significantly contribute to the development of authentic Arabic LLMs. This study not only highlights the potential for creating linguistically and culturally accurate Arabic LLMs but also sets a precedent for future research in enhancing the authenticity of Arabic language models.",

diff --git a/datasets/1993-2007_united_nations_parallel_text.json b/datasets/1993-2007_united_nations_parallel_text.json
@@ -1,7 +1,7 @@
 {
     "Name": "1993-2007 United Nations Parallel Text",
     "Subsets": [],
-    "HF Link": "nan",
+    "HF Link": "",
     "Link": "https://catalog.ldc.upenn.edu/LDC2013T06",
     "License": "LDC User Agreement for Non-Members",
     "Year": 2013,
@@ -15,22 +15,22 @@
     "Unit": "documents",
     "Ethical Risks": "Low",
     "Provider": "LDC",
-    "Derived From": "nan",
-    "Paper Title": "nan",
-    "Paper Link": "nan",
+    "Derived From": "",
+    "Paper Title": "",
+    "Paper Link": "",
     "Script": "Arab",
     "Tokenized": "No",
     "Host": "LDC",
     "Access": "With-Fee",
     "Cost": "175.00 $",
     "Test Split": "No",
     "Tasks": "machine translation",
-    "Venue Title": "nan",
-    "Citations": "nan",
-    "Venue Type": "nan",
-    "Venue Name": "nan",
-    "Authors": "nan",
-    "Affiliations": "nan",
-    "Abstract": "nan",
+    "Venue Title": "",
+    "Citations": "",
+    "Venue Type": "",
+    "Venue Name": "",
+    "Authors": "",
+    "Affiliations": "",
+    "Abstract": "",
     "Added By": "Zaid Alyafeai"
 }
diff --git a/datasets/1997_hub5_arabic_evaluation.json b/datasets/1997_hub5_arabic_evaluation.json
@@ -1,7 +1,7 @@
 {
     "Name": "1997 HUB5 Arabic Evaluation",
     "Subsets": [],
-    "HF Link": "nan",
+    "HF Link": "",
     "Link": "https://catalog.ldc.upenn.edu/LDC2002S22",
     "License": "LDC User Agreement for Non-Members",
     "Year": 2002,
@@ -15,22 +15,22 @@
     "Unit": "documents",
     "Ethical Risks": "Low",
     "Provider": "LDC",
-    "Derived From": "nan",
-    "Paper Title": "nan",
-    "Paper Link": "nan",
-    "Script": "nan",
+    "Derived From": "",
+    "Paper Title": "",
+    "Paper Link": "",
+    "Script": "",
     "Tokenized": "No",
     "Host": "LDC",
     "Access": "With-Fee",
     "Cost": "1,500.00 $",
     "Test Split": "No",
     "Tasks": "speech recognition",
-    "Venue Title": "nan",
-    "Citations": "nan",
-    "Venue Type": "nan",
-    "Venue Name": "nan",
-    "Authors": "nan",
-    "Affiliations": "nan",
-    "Abstract": "nan",
+    "Venue Title": "",
+    "Citations": "",
+    "Venue Type": "",
+    "Venue Name": "",
+    "Authors": "",
+    "Affiliations": "",
+    "Abstract": "",
     "Added By": "Zaid Alyafeai"
 }
diff --git a/datasets/1997_hub5_arabic_transcripts.json b/datasets/1997_hub5_arabic_transcripts.json
@@ -1,7 +1,7 @@
 {
     "Name": "1997 HUB5 Arabic Transcripts",
     "Subsets": [],
-    "HF Link": "nan",
+    "HF Link": "",
     "Link": "https://catalog.ldc.upenn.edu/LDC2002T39",
     "License": "LDC User Agreement for Non-Members",
     "Year": 2002,
@@ -15,22 +15,22 @@
     "Unit": "documents",
     "Ethical Risks": "Low",
     "Provider": "LDC",
-    "Derived From": "nan",
-    "Paper Title": "nan",
-    "Paper Link": "nan",
-    "Script": "nan",
+    "Derived From": "",
+    "Paper Title": "",
+    "Paper Link": "",
+    "Script": "",
     "Tokenized": "No",
     "Host": "LDC",
     "Access": "With-Fee",
     "Cost": "500.00 $",
     "Test Split": "No",
     "Tasks": "speech recognition",
-    "Venue Title": "nan",
-    "Citations": "nan",
-    "Venue Type": "nan",
-    "Venue Name": "nan",
-    "Authors": "nan",
-    "Affiliations": "nan",
-    "Abstract": "nan",
+    "Venue Title": "",
+    "Citations": "",
+    "Venue Type": "",
+    "Venue Name": "",
+    "Authors": "",
+    "Affiliations": "",
+    "Abstract": "",
     "Added By": "Zaid Alyafeai"
 }
diff --git a/datasets/2003_nist_language_recognition_evaluation.json b/datasets/2003_nist_language_recognition_evaluation.json
@@ -1,7 +1,7 @@
 {
     "Name": "2003 NIST Language Recognition Evaluation",
     "Subsets": [],
-    "HF Link": "nan",
+    "HF Link": "",
     "Link": "https://catalog.ldc.upenn.edu/LDC2006S31",
     "License": "LDC User Agreement for Non-Members",
     "Year": 2006,
@@ -15,22 +15,22 @@
     "Unit": "hours",
     "Ethical Risks": "Low",
     "Provider": "LDC",
-    "Derived From": "nan",
-    "Paper Title": "nan",
-    "Paper Link": "nan",
+    "Derived From": "",
+    "Paper Title": "",
+    "Paper Link": "",
     "Script": "Arab",
     "Tokenized": "No",
     "Host": "LDC",
     "Access": "With-Fee",
     "Cost": "500.00 $",
     "Test Split": "No",
     "Tasks": "language identification",
-    "Venue Title": "nan",
-    "Citations": "nan",
-    "Venue Type": "nan",
-    "Venue Name": "nan",
-    "Authors": "nan",
-    "Affiliations": "nan",
-    "Abstract": "nan",
+    "Venue Title": "",
+    "Citations": "",
+    "Venue Type": "",
+    "Venue Name": "",
+    "Authors": "",
+    "Affiliations": "",
+    "Abstract": "",
     "Added By": "Zaid Alyafeai"
 }
diff --git a/datasets/2003_nist_rich_transcription_evaluation_data.json b/datasets/2003_nist_rich_transcription_evaluation_data.json
@@ -1,7 +1,7 @@
 {
     "Name": "2003 NIST Rich Transcription Evaluation Data",
     "Subsets": [],
-    "HF Link": "nan",
+    "HF Link": "",
     "Link": "https://catalog.ldc.upenn.edu/LDC2007S10",
     "License": "LDC User Agreement for Non-Members",
     "Year": 2007,
@@ -15,22 +15,22 @@
     "Unit": "hours",
     "Ethical Risks": "Low",
     "Provider": "LDC",
-    "Derived From": "nan",
-    "Paper Title": "nan",
-    "Paper Link": "nan",
+    "Derived From": "",
+    "Paper Title": "",
+    "Paper Link": "",
     "Script": "Arab",
     "Tokenized": "No",
     "Host": "LDC",
     "Access": "With-Fee",
     "Cost": "2,000.00 $",
     "Test Split": "No",
     "Tasks": "speech recognition",
-    "Venue Title": "nan",
-    "Citations": "nan",
-    "Venue Type": "nan",
-    "Venue Name": "nan",
-    "Authors": "nan",
-    "Affiliations": "nan",
-    "Abstract": "nan",
+    "Venue Title": "",
+    "Citations": "",
+    "Venue Type": "",
+    "Venue Name": "",
+    "Authors": "",
+    "Affiliations": "",
+    "Abstract": "",
     "Added By": "Zaid Alyafeai"
 }
diff --git a/datasets/2005_nist_speaker_recognition_evaluation_test_data.json b/datasets/2005_nist_speaker_recognition_evaluation_test_data.json
@@ -1,7 +1,7 @@
 {
     "Name": "2005 NIST Speaker Recognition Evaluation Test Data",
     "Subsets": [],
-    "HF Link": "nan",
+    "HF Link": "",
     "Link": "https://catalog.ldc.upenn.edu/LDC2011S04",
     "License": "LDC User Agreement for Non-Members",
     "Year": 2011,
@@ -15,22 +15,22 @@
     "Unit": "hours",
     "Ethical Risks": "Low",
     "Provider": "LDC",
-    "Derived From": "nan",
-    "Paper Title": "nan",
-    "Paper Link": "nan",
+    "Derived From": "",
+    "Paper Title": "",
+    "Paper Link": "",
     "Script": "Arab",
     "Tokenized": "No",
     "Host": "LDC",
     "Access": "With-Fee",
     "Cost": "400.00 $",
     "Test Split": "No",
     "Tasks": "speaker identification",
-    "Venue Title": "nan",
-    "Citations": "nan",
-    "Venue Type": "nan",
-    "Venue Name": "nan",
-    "Authors": "nan",
-    "Affiliations": "nan",
-    "Abstract": "nan",
+    "Venue Title": "",
+    "Citations": "",
+    "Venue Type": "",
+    "Venue Name": "",
+    "Authors": "",
+    "Affiliations": "",
+    "Abstract": "",
     "Added By": "Zaid Alyafeai"
 }
diff --git a/datasets/2005_nist_speaker_recognition_evaluation_training_data.json b/datasets/2005_nist_speaker_recognition_evaluation_training_data.json
@@ -1,7 +1,7 @@
 {
     "Name": "2005 NIST Speaker Recognition Evaluation Training Data",
     "Subsets": [],
-    "HF Link": "nan",
+    "HF Link": "",
     "Link": "https://catalog.ldc.upenn.edu/LDC2011S01",
     "License": "LDC User Agreement for Non-Members",
     "Year": 2011,
@@ -15,22 +15,22 @@
     "Unit": "hours",
     "Ethical Risks": "Low",
     "Provider": "LDC",
-    "Derived From": "nan",
-    "Paper Title": "nan",
-    "Paper Link": "nan",
+    "Derived From": "",
+    "Paper Title": "",
+    "Paper Link": "",
     "Script": "Arab",
     "Tokenized": "No",
     "Host": "LDC",
     "Access": "With-Fee",
     "Cost": "350.00 $",
     "Test Split": "No",
     "Tasks": "speaker identification",
-    "Venue Title": "nan",
-    "Citations": "nan",
-    "Venue Type": "nan",
-    "Venue Name": "nan",
-    "Authors": "nan",
-    "Affiliations": "nan",
-    "Abstract": "nan",
+    "Venue Title": "",
+    "Citations": "",
+    "Venue Type": "",
+    "Venue Name": "",
+    "Authors": "",
+    "Affiliations": "",
+    "Abstract": "",
     "Added By": "Zaid Alyafeai"
 }
diff --git a/datasets/2006_conll_shared_task_-_arabic_&_czech.json b/datasets/2006_conll_shared_task_-_arabic_&_czech.json
@@ -1,7 +1,7 @@
 {
     "Name": "2006 CoNLL Shared Task - Arabic & Czech",
     "Subsets": [],
-    "HF Link": "nan",
+    "HF Link": "",
     "Link": "https://catalog.ldc.upenn.edu/LDC2015T12",
     "License": "LDC User Agreement for Non-Members",
     "Year": 2006,
@@ -11,26 +11,26 @@
     "Form": "text",
     "Collection Style": "other",
     "Description": "2006 CoNLL Shared Task - Arabic & Czech consists of Arabic and Czech dependency treebanks used as part of the CoNLL 2006 shared task on multi-lingual dependency parsing.",
-    "Volume": "nan",
+    "Volume": "",
     "Unit": "tokens",
     "Ethical Risks": "Low",
     "Provider": "LDC",
     "Derived From": "PADT",
-    "Paper Title": "nan",
-    "Paper Link": "nan",
+    "Paper Title": "",
+    "Paper Link": "",
     "Script": "Arab-Latn",
     "Tokenized": "No",
     "Host": "LDC",
     "Access": "Upon-Request",
-    "Cost": "nan",
+    "Cost": "",
     "Test Split": "No",
     "Tasks": "syntactic parsing",
-    "Venue Title": "nan",
-    "Citations": "nan",
-    "Venue Type": "nan",
-    "Venue Name": "nan",
-    "Authors": "nan",
-    "Affiliations": "nan",
-    "Abstract": "nan",
+    "Venue Title": "",
+    "Citations": "",
+    "Venue Type": "",
+    "Venue Name": "",
+    "Authors": "",
+    "Affiliations": "",
+    "Abstract": "",
     "Added By": "Zaid Alyafeai"
 }