Skip to content

Commit

Permalink
remove nan values
Browse files Browse the repository at this point in the history
  • Loading branch information
zaidalyafeai committed Dec 21, 2024
1 parent dd11107 commit c861093
Show file tree
Hide file tree
Showing 709 changed files with 4,857 additions and 4,857 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,4 @@ vendor
.pytest_cache
.hypothesis
Gemfile.lock
save_jsons.py
4 changes: 2 additions & 2 deletions assets/js/search.js
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,12 @@ function accessBadge(text) {
function setAttributes(attribute, element) {

if (attribute == "Cost") {
if (element[attribute] != "nan") return element[attribute]
if (element[attribute] != "") return element[attribute]
else return "0$"
} else if (attribute == "Access") {
return accessBadge(element[attribute])
}
else if (element[attribute] != "nan") {
else if (element[attribute] != "") {
return element[attribute]
}

Expand Down
1 change: 0 additions & 1 deletion data.json

This file was deleted.

6 changes: 3 additions & 3 deletions datasets/101_billion_arabic_words_dataset.json
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,13 @@
"Tokenized": "No",
"Host": "HuggingFace",
"Access": "Free",
"Cost": "nan",
"Cost": "",
"Test Split": "No",
"Tasks": "text generation, language modeling",
"Venue Title": "arXiv",
"Citations": "nan",
"Citations": "",
"Venue Type": "preprint",
"Venue Name": "nan",
"Venue Name": "",
"Authors": "Manel Aloui, Hasna Chouikhi, Ghaith Chaabane, Haithem Kchaou, and Chehir Dhaouadi",
"Affiliations": "Clusterlab",
"Abstract": "In recent years, Large Language Models (LLMs) have revolutionized the field of natural language processing, showcasing an impressive rise predominantly in English-centric domains. These advancements have set a global benchmark, inspiring significant efforts toward developing Arabic LLMs capable of understanding and generating the Arabic language with remarkable accuracy. Despite these advancements, a critical challenge persists: the potential bias in Arabic LLMs, primarily attributed to their reliance on datasets comprising English data that has been translated into Arabic. This reliance not only compromises the authenticity of the generated content but also reflects a broader issue\u2014the scarcity of original quality Arabic linguistic data. This study aims to address the data scarcity in the Arab world and to encourage the development of Arabic Language Models that are true to both the linguistic and nuances of the region. We undertook a large-scale data mining project, extracting a substantial volume of text from the Common Crawl WET files, specifically targeting Arabic content. The extracted data underwent a rigorous cleaning and deduplication process, using innovative techniques to ensure the integrity and uniqueness of the dataset. The result is the 101 Billion Arabic Words Dataset, the largest Arabic dataset available to date, which can significantly contribute to the development of authentic Arabic LLMs. This study not only highlights the potential for creating linguistically and culturally accurate Arabic LLMs but also sets a precedent for future research in enhancing the authenticity of Arabic language models.",
Expand Down
22 changes: 11 additions & 11 deletions datasets/1993-2007_united_nations_parallel_text.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"Name": "1993-2007 United Nations Parallel Text",
"Subsets": [],
"HF Link": "nan",
"HF Link": "",
"Link": "https://catalog.ldc.upenn.edu/LDC2013T06",
"License": "LDC User Agreement for Non-Members",
"Year": 2013,
Expand All @@ -15,22 +15,22 @@
"Unit": "documents",
"Ethical Risks": "Low",
"Provider": "LDC",
"Derived From": "nan",
"Paper Title": "nan",
"Paper Link": "nan",
"Derived From": "",
"Paper Title": "",
"Paper Link": "",
"Script": "Arab",
"Tokenized": "No",
"Host": "LDC",
"Access": "With-Fee",
"Cost": "175.00 $",
"Test Split": "No",
"Tasks": "machine translation",
"Venue Title": "nan",
"Citations": "nan",
"Venue Type": "nan",
"Venue Name": "nan",
"Authors": "nan",
"Affiliations": "nan",
"Abstract": "nan",
"Venue Title": "",
"Citations": "",
"Venue Type": "",
"Venue Name": "",
"Authors": "",
"Affiliations": "",
"Abstract": "",
"Added By": "Zaid Alyafeai"
}
24 changes: 12 additions & 12 deletions datasets/1997_hub5_arabic_evaluation.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"Name": "1997 HUB5 Arabic Evaluation",
"Subsets": [],
"HF Link": "nan",
"HF Link": "",
"Link": "https://catalog.ldc.upenn.edu/LDC2002S22",
"License": "LDC User Agreement for Non-Members",
"Year": 2002,
Expand All @@ -15,22 +15,22 @@
"Unit": "documents",
"Ethical Risks": "Low",
"Provider": "LDC",
"Derived From": "nan",
"Paper Title": "nan",
"Paper Link": "nan",
"Script": "nan",
"Derived From": "",
"Paper Title": "",
"Paper Link": "",
"Script": "",
"Tokenized": "No",
"Host": "LDC",
"Access": "With-Fee",
"Cost": "1,500.00 $",
"Test Split": "No",
"Tasks": "speech recognition",
"Venue Title": "nan",
"Citations": "nan",
"Venue Type": "nan",
"Venue Name": "nan",
"Authors": "nan",
"Affiliations": "nan",
"Abstract": "nan",
"Venue Title": "",
"Citations": "",
"Venue Type": "",
"Venue Name": "",
"Authors": "",
"Affiliations": "",
"Abstract": "",
"Added By": "Zaid Alyafeai"
}
24 changes: 12 additions & 12 deletions datasets/1997_hub5_arabic_transcripts.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"Name": "1997 HUB5 Arabic Transcripts",
"Subsets": [],
"HF Link": "nan",
"HF Link": "",
"Link": "https://catalog.ldc.upenn.edu/LDC2002T39",
"License": "LDC User Agreement for Non-Members",
"Year": 2002,
Expand All @@ -15,22 +15,22 @@
"Unit": "documents",
"Ethical Risks": "Low",
"Provider": "LDC",
"Derived From": "nan",
"Paper Title": "nan",
"Paper Link": "nan",
"Script": "nan",
"Derived From": "",
"Paper Title": "",
"Paper Link": "",
"Script": "",
"Tokenized": "No",
"Host": "LDC",
"Access": "With-Fee",
"Cost": "500.00 $",
"Test Split": "No",
"Tasks": "speech recognition",
"Venue Title": "nan",
"Citations": "nan",
"Venue Type": "nan",
"Venue Name": "nan",
"Authors": "nan",
"Affiliations": "nan",
"Abstract": "nan",
"Venue Title": "",
"Citations": "",
"Venue Type": "",
"Venue Name": "",
"Authors": "",
"Affiliations": "",
"Abstract": "",
"Added By": "Zaid Alyafeai"
}
22 changes: 11 additions & 11 deletions datasets/2003_nist_language_recognition_evaluation.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"Name": "2003 NIST Language Recognition Evaluation",
"Subsets": [],
"HF Link": "nan",
"HF Link": "",
"Link": "https://catalog.ldc.upenn.edu/LDC2006S31",
"License": "LDC User Agreement for Non-Members",
"Year": 2006,
Expand All @@ -15,22 +15,22 @@
"Unit": "hours",
"Ethical Risks": "Low",
"Provider": "LDC",
"Derived From": "nan",
"Paper Title": "nan",
"Paper Link": "nan",
"Derived From": "",
"Paper Title": "",
"Paper Link": "",
"Script": "Arab",
"Tokenized": "No",
"Host": "LDC",
"Access": "With-Fee",
"Cost": "500.00 $",
"Test Split": "No",
"Tasks": "language identification",
"Venue Title": "nan",
"Citations": "nan",
"Venue Type": "nan",
"Venue Name": "nan",
"Authors": "nan",
"Affiliations": "nan",
"Abstract": "nan",
"Venue Title": "",
"Citations": "",
"Venue Type": "",
"Venue Name": "",
"Authors": "",
"Affiliations": "",
"Abstract": "",
"Added By": "Zaid Alyafeai"
}
22 changes: 11 additions & 11 deletions datasets/2003_nist_rich_transcription_evaluation_data.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"Name": "2003 NIST Rich Transcription Evaluation Data",
"Subsets": [],
"HF Link": "nan",
"HF Link": "",
"Link": "https://catalog.ldc.upenn.edu/LDC2007S10",
"License": "LDC User Agreement for Non-Members",
"Year": 2007,
Expand All @@ -15,22 +15,22 @@
"Unit": "hours",
"Ethical Risks": "Low",
"Provider": "LDC",
"Derived From": "nan",
"Paper Title": "nan",
"Paper Link": "nan",
"Derived From": "",
"Paper Title": "",
"Paper Link": "",
"Script": "Arab",
"Tokenized": "No",
"Host": "LDC",
"Access": "With-Fee",
"Cost": "2,000.00 $",
"Test Split": "No",
"Tasks": "speech recognition",
"Venue Title": "nan",
"Citations": "nan",
"Venue Type": "nan",
"Venue Name": "nan",
"Authors": "nan",
"Affiliations": "nan",
"Abstract": "nan",
"Venue Title": "",
"Citations": "",
"Venue Type": "",
"Venue Name": "",
"Authors": "",
"Affiliations": "",
"Abstract": "",
"Added By": "Zaid Alyafeai"
}
22 changes: 11 additions & 11 deletions datasets/2005_nist_speaker_recognition_evaluation_test_data.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"Name": "2005 NIST Speaker Recognition Evaluation Test Data",
"Subsets": [],
"HF Link": "nan",
"HF Link": "",
"Link": "https://catalog.ldc.upenn.edu/LDC2011S04",
"License": "LDC User Agreement for Non-Members",
"Year": 2011,
Expand All @@ -15,22 +15,22 @@
"Unit": "hours",
"Ethical Risks": "Low",
"Provider": "LDC",
"Derived From": "nan",
"Paper Title": "nan",
"Paper Link": "nan",
"Derived From": "",
"Paper Title": "",
"Paper Link": "",
"Script": "Arab",
"Tokenized": "No",
"Host": "LDC",
"Access": "With-Fee",
"Cost": "400.00 $",
"Test Split": "No",
"Tasks": "speaker identification",
"Venue Title": "nan",
"Citations": "nan",
"Venue Type": "nan",
"Venue Name": "nan",
"Authors": "nan",
"Affiliations": "nan",
"Abstract": "nan",
"Venue Title": "",
"Citations": "",
"Venue Type": "",
"Venue Name": "",
"Authors": "",
"Affiliations": "",
"Abstract": "",
"Added By": "Zaid Alyafeai"
}
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"Name": "2005 NIST Speaker Recognition Evaluation Training Data",
"Subsets": [],
"HF Link": "nan",
"HF Link": "",
"Link": "https://catalog.ldc.upenn.edu/LDC2011S01",
"License": "LDC User Agreement for Non-Members",
"Year": 2011,
Expand All @@ -15,22 +15,22 @@
"Unit": "hours",
"Ethical Risks": "Low",
"Provider": "LDC",
"Derived From": "nan",
"Paper Title": "nan",
"Paper Link": "nan",
"Derived From": "",
"Paper Title": "",
"Paper Link": "",
"Script": "Arab",
"Tokenized": "No",
"Host": "LDC",
"Access": "With-Fee",
"Cost": "350.00 $",
"Test Split": "No",
"Tasks": "speaker identification",
"Venue Title": "nan",
"Citations": "nan",
"Venue Type": "nan",
"Venue Name": "nan",
"Authors": "nan",
"Affiliations": "nan",
"Abstract": "nan",
"Venue Title": "",
"Citations": "",
"Venue Type": "",
"Venue Name": "",
"Authors": "",
"Affiliations": "",
"Abstract": "",
"Added By": "Zaid Alyafeai"
}
24 changes: 12 additions & 12 deletions datasets/2006_conll_shared_task_-_arabic_&_czech.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"Name": "2006 CoNLL Shared Task - Arabic & Czech",
"Subsets": [],
"HF Link": "nan",
"HF Link": "",
"Link": "https://catalog.ldc.upenn.edu/LDC2015T12",
"License": "LDC User Agreement for Non-Members",
"Year": 2006,
Expand All @@ -11,26 +11,26 @@
"Form": "text",
"Collection Style": "other",
"Description": "2006 CoNLL Shared Task - Arabic & Czech consists of Arabic and Czech dependency treebanks used as part of the CoNLL 2006 shared task on multi-lingual dependency parsing.",
"Volume": "nan",
"Volume": "",
"Unit": "tokens",
"Ethical Risks": "Low",
"Provider": "LDC",
"Derived From": "PADT",
"Paper Title": "nan",
"Paper Link": "nan",
"Paper Title": "",
"Paper Link": "",
"Script": "Arab-Latn",
"Tokenized": "No",
"Host": "LDC",
"Access": "Upon-Request",
"Cost": "nan",
"Cost": "",
"Test Split": "No",
"Tasks": "syntactic parsing",
"Venue Title": "nan",
"Citations": "nan",
"Venue Type": "nan",
"Venue Name": "nan",
"Authors": "nan",
"Affiliations": "nan",
"Abstract": "nan",
"Venue Title": "",
"Citations": "",
"Venue Type": "",
"Venue Name": "",
"Authors": "",
"Affiliations": "",
"Abstract": "",
"Added By": "Zaid Alyafeai"
}
Loading

0 comments on commit c861093

Please sign in to comment.