Skip to content

Commit

Permalink
Update sort script to rewrite the CSV with normalized Arabic keys
Browse files (browse the repository at this point in the history)
  • Loading branch information
sterlingwes committed Feb 11, 2024
1 parent 9439757 commit bfee7e1
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 22 deletions.
14 changes: 7 additions & 7 deletions scripts/data/common/killed-in-gaza/data/dict_ar_en.csv
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ ar,en
أبوشماله,Aboshmalah
أبولبن,Abolbn
أبومحيسن,Abomuhaisan
أُبي,Abi
أبي,Abi
أثار,Athar
أثال,Athal
أحلام,Ahlam
Expand Down Expand Up @@ -1514,7 +1514,7 @@ ar,en
ايمان,eman
ايمن,ayman
ايناس,Ainas
اّيه,Aّiah
ايه,Aّiah
ايه,ayah
ايهاب,ihab
ايهم,Aihm
Expand Down Expand Up @@ -2094,7 +2094,7 @@ ar,en
رياطي,Raiatai
ريام,Raiam
ريان,Raian
ريّان,Riّan
ريان,Riّan
ريتا,rita
ريتاج,Ritaj
ريتال,Rital
Expand Down Expand Up @@ -2612,7 +2612,7 @@ ar,en
عبير,abeer
عتيق,atiq
عثمان,Athman
عُثمان,Aُthman
عثمان,Aُthman
عجور,Ajoar
عدس,Ads
عدله,Adlah
Expand Down Expand Up @@ -2685,7 +2685,7 @@ ar,en
عكيلة,Akilah
عكيله,Akilah
علا,Alaa
عُلا,Ula
علا,Ula
علاء,Alaaa
علاءالدين,AlaaaAl-Deen
علام,Alaam
Expand Down Expand Up @@ -3083,7 +3083,7 @@ ar,en
محضيه,Muhdhiah
محفوظ,Mahfoz
محمد,muhammad
مُحمد,Muhammad
محمد,Muhammad
محمدد,Muhamdd
محمدرياض,Muhamdraiadh
محمدعوض,Muhamduaodh
Expand Down Expand Up @@ -3514,7 +3514,7 @@ ar,en
يسر,Yosr
يسرى,Ysura
يسري,Ysri
ُيسري,Ysri
يسري,Ysri
يعقوب,Yaqob
يقين,Yqain
يمامه,Yamamuah
Expand Down
35 changes: 20 additions & 15 deletions scripts/utils/sort-csv.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@ const headerRow = "original,cleaned";
const sortCsv = (repoFilePath: string) => {
const csv = fs.readFileSync(repoFilePath).toString();

const uniqueArParts = new Set<string>();
const duplicates = new Set<string>();

const sortedRows = csv
.split("\n")
.sort((aRaw, bRaw) => {
Expand All @@ -21,21 +24,19 @@ const sortCsv = (repoFilePath: string) => {
const b = new ArabicClass(bRaw).normalize();
return a.localeCompare(b);
})
.map((row) => row.replace(/\s*,\s*/, ",").trim())
.filter((row) => !!row);
.map((row) => {
const [arKey, cleanedValue] = row.split(/\s*,\s*/);
const normalizedArKey = new ArabicClass(arKey.trim()).normalize();

const uniqueArParts = new Set<string>();
const duplicates = new Set<string>();
sortedRows.forEach((row) => {
const [arRaw] = row.split(",");
const ar = new ArabicClass(arRaw).normalize();
if (uniqueArParts.has(normalizedArKey)) {
duplicates.add(normalizedArKey);
} else {
uniqueArParts.add(normalizedArKey);
}

if (uniqueArParts.has(ar)) {
duplicates.add(ar);
} else {
uniqueArParts.add(ar);
}
});
return [normalizedArKey, cleanedValue.trim()].join(",");
})
.filter((row) => !!row);

console.log(
`${filePath} sorted alphabetically by arabic name column (${uniqueArParts.size} names)`
Expand All @@ -51,8 +52,12 @@ const sortCsv = (repoFilePath: string) => {
};

const filePath = process.argv.slice().pop();
if (typeof filePath !== "string" || filePath.endsWith("sort-csv.ts")) {
console.log("requires a repo file path argument");
if (
typeof filePath !== "string" ||
filePath.endsWith("sort-csv.ts") ||
filePath.includes("ar_") === false
) {
console.log("requires a repo file path argument for ar_* dict csv");
process.exit(1);
}

Expand Down

0 comments on commit bfee7e1

Please sign in to comment.