Skip to content

Commit

Permalink
share csv logic & assert RTL dict for ar->ar
Browse files Browse the repository at this point in the history
  • Loading branch information
sterlingwes committed Feb 12, 2024
1 parent baf6cf0 commit 880c77b
Show file tree
Hide file tree
Showing 6 changed files with 83 additions and 50 deletions.
3 changes: 3 additions & 0 deletions scripts/data/common/killed-in-gaza/constants.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
// CI has proven unreliable about preserving the RTL column order of the
// ar_ar dict CSV, so readers assert this key exists in the parsed dict and
// either invert the dict or throw when it's still missing.
export const arToArAssertKey = "ابو الليل";
44 changes: 8 additions & 36 deletions scripts/data/common/killed-in-gaza/generate_killed_list.ts
Original file line number Diff line number Diff line change
@@ -1,41 +1,17 @@
import fs from "fs";
import { ArabicClass } from "arabic-utils";
import { readCsv, readCsvToDict, writeCsv } from "../../../utils/csv";
import { arToArAssertKey } from "./constants";

const pwd = "scripts/data/common/killed-in-gaza";
const arRawNameColumnLabel = "name_ar_raw";
const arEnNameColumnLabel = "name_en";

const readCsv = (repoPath: string) => {
const csvString = fs.readFileSync(repoPath).toString();
return csvString.split(/\r?\n/g).map((row) => row.split(","));
};

/**
* read a CSV file and return an object lookup ("dict") with keys
* as the first CSV column value, and values as the second CSV column
*/
const readCsvToDict = (repoPath: string) => {
return readCsv(repoPath).reduce(
(dict, row) => ({
...dict,
[row[0]]: row[1],
}),
{} as Record<string, string>
);
};

const rawList = readCsv(`${pwd}/data/raw.csv`);
let arToAr = readCsvToDict(`${pwd}/data/dict_ar_ar.csv`);
const arToEn = readCsvToDict(`${pwd}/data/dict_ar_en.csv`);

// if this matches, our ar->ar dict was read backwards and we need to flip it
if (arToAr["ابوالليل"]) {
console.log("⚠️ inverting ar->ar which was read LTR");
arToAr = Object.entries(arToAr).reduce(
(flipped, [key, value]) => ({ ...flipped, [value]: key }),
{}
);
}
const arToAr = readCsvToDict(`${pwd}/data/dict_ar_ar.csv`, {
assertKey: arToArAssertKey,
});
const arToEn = readCsvToDict(`${pwd}/data/dict_ar_en.csv`);

const [rawHeaderRow, ...rawListRows] = rawList;
const arRawColumn = rawHeaderRow.indexOf(arRawNameColumnLabel);
Expand Down Expand Up @@ -74,10 +50,6 @@ const resultList = rawListRows.map((row) => {
return [...row, replaceWholeNameSegments(normalizedArName, arToEn)];
});

const toCsv = (list: string[][]) => list.map((row) => row.join(",")).join("\n");
const newHeaders = [...rawHeaderRow, arEnNameColumnLabel];

const newHeaders = [...rawHeaderRow, arEnNameColumnLabel].join(",");
fs.writeFileSync(
`${pwd}/output/result.csv`,
`${newHeaders}\n${toCsv(resultList)}`
);
writeCsv(`${pwd}/output/result.csv`, [newHeaders, ...resultList]);
6 changes: 3 additions & 3 deletions scripts/data/v2/derived/csv.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { writeCsv, writeJson } from "../../../utils/fs";
import { writeManifestCsv, writeJson } from "../../../utils/fs";
import { ApiResource } from "../../../../types/api.types";
import { KilledInGaza } from "../../../../types/killed-in-gaza.types";

Expand All @@ -20,7 +20,7 @@ const killedRows = killedPersons.reduce(
},
[killedRowOrder.slice()] as string[][]
);
writeCsv(
writeManifestCsv(
ApiResource.KilledInGazaV2,
`${writePath}/killed-in-gaza.csv`,
killedRows
Expand All @@ -33,7 +33,7 @@ const dailyRows = dailies.reduce(
},
[dailyRowOrder.slice()] as string[][]
);
writeCsv(
writeManifestCsv(
ApiResource.CasualtiesDailyV2,
`${writePath}/casualties_daily.csv`,
dailyRows
Expand Down
50 changes: 50 additions & 0 deletions scripts/utils/csv.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import fs from "fs";

/**
 * Read a CSV file and return its rows as arrays of column values.
 * Naive parse: splits on commas, so quoted fields are not supported.
 *
 * @param repoPath relative path from the root of the repo
 */
export const readCsv = (repoPath: string) => {
  const csvString = fs.readFileSync(repoPath).toString();
  return (
    csvString
      .split(/\r?\n/g)
      // drop blank lines (in particular the trailing newline most files end
      // with) so callers never see a bogus [""] row
      .filter((line) => line.length > 0)
      .map((row) => row.split(","))
  );
};

/**
 * Read a CSV file and return an object lookup ("dict") with keys
 * as the first CSV column value, and values as the second CSV column.
 *
 * @param repoPath relative path from root of repo
 * @param options optional object with assertion key to make sure key exists in resulting object
 * if assertKey does not exist, the dict is inverted and if the assertKey still does not exist
 * the method @throws
 */
export const readCsvToDict = (
  repoPath: string,
  options: { assertKey?: string; invert?: boolean } = {}
): Record<string, string> => {
  // column roles depend on read direction: inverted reads column 2 as the key
  const keyIndex = options.invert ? 1 : 0;
  const valueIndex = options.invert ? 0 : 1;

  // plain assignment keeps this O(n); spreading the accumulator per row
  // re-copies the whole dict each iteration (O(n^2))
  const result: Record<string, string> = {};
  for (const row of readCsv(repoPath)) {
    result[row[keyIndex]] = row[valueIndex];
  }

  if (options.assertKey && !result[options.assertKey]) {
    if (options.invert) {
      // already retried inverted — the key genuinely isn't in either orientation
      throw new Error(
        `Expected dict to include key '${options.assertKey}' but it did not exist in the initial or inverted dict`
      );
    }
    console.log(
      `could not find assertKey ${options.assertKey} in resulting dict, inverting`
    );
    return readCsvToDict(repoPath, { ...options, invert: true });
  }

  return result;
};

/** Serialize rows into CSV text: cells comma-joined, lines CRLF-joined. */
const toCsv = (rows: string[][]): string => {
  const lines: string[] = [];
  for (const cells of rows) {
    lines.push(cells.join(","));
  }
  return lines.join("\r\n");
};

/** Write the given rows out as a CSV file at the given repo path. */
export const writeCsv = (repoPath: string, rows: string[][]) => {
  const contents = toCsv(rows);
  fs.writeFileSync(repoPath, contents);
};
2 changes: 1 addition & 1 deletion scripts/utils/fs.ts
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ export const writeJson = (
addToManifest(resource, { minified, unminified: unminifiedFileName });
};

export const writeCsv = (
export const writeManifestCsv = (
resource: ApiResource,
filePath: string,
rows: any[][]
Expand Down
28 changes: 18 additions & 10 deletions scripts/utils/sort-csv.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import { ArabicClass } from "arabic-utils";
import fs from "fs";
import { readCsvToDict } from "./csv";
import { arToArAssertKey } from "../data/common/killed-in-gaza/constants";

const headerRow = "original,cleaned";

Expand All @@ -18,17 +20,21 @@ const rowTransformerForDictResultType = {
// for ar_ar dict, we maintain spaces in key to allow for segment consolidation
// we normalize the value since that will be used to lookup against normalized
// ar values in the ar_en dict
ar: (row: string) => {
const [arKey, cleanedValue] = row.split(",");
ar: ([arKey, cleanedValue]: [string, string]) => {
trackDuplicateKeysForLogging(arKey);
const normalizedValue = new ArabicClass(cleanedValue.trim()).normalize();
return [arKey, normalizedValue].join(",");
const arRow = [arKey, normalizedValue].join(",");
if (arKey === arToArAssertKey && arRow.endsWith(arToArAssertKey)) {
throw new Error(
`sort-csv expected arKey '${arKey}' in RTL but resulting row was inverted: ${arRow}`
);
}
return arRow;
},

// for ar_en we split on potential spaces around the comma to ensure no spaces in
// the key column lead to mismatched lookups, and we normalize the AR key per above
en: (row: string) => {
const [arKey, cleanedValue] = row.split(/\s*,\s*/);
// for EN we trim leading and ending spaces from each column value and normalize
// the arabic key column value so that it can match the value column in the ar dict
en: ([arKey, cleanedValue]: [string, string]) => {
const normalizedArKey = new ArabicClass(arKey.trim()).normalize();
trackDuplicateKeysForLogging(normalizedArKey);
return [normalizedArKey, cleanedValue.trim().toLowerCase()].join(",");
Expand All @@ -54,10 +60,12 @@ const sortForType = (resultType: "ar" | "en", list: string[]) => {
};

const sortCsv = (repoFilePath: string, resultType: "ar" | "en") => {
const csv = fs.readFileSync(repoFilePath).toString();
const csvDict = readCsvToDict(
repoFilePath,
resultType === "ar" ? { assertKey: arToArAssertKey } : {}
);

const cleanedRows = csv
.split("\n")
const cleanedRows = Object.entries(csvDict)
.map(rowTransformerForDictResultType[resultType])
.filter((row) => !!row);

Expand Down

0 comments on commit 880c77b

Please sign in to comment.