diff --git a/scripts/data/common/killed-in-gaza/constants.ts b/scripts/data/common/killed-in-gaza/constants.ts new file mode 100644 index 00000000..3d7dd523 --- /dev/null +++ b/scripts/data/common/killed-in-gaza/constants.ts @@ -0,0 +1,3 @@ +// we've found that the RTL column order in the ar_ar dict csv isn't reliable in CI +// so if the key isn't in the resulting dict we either invert it or throw an error +export const arToArAssertKey = "ابو الليل"; diff --git a/scripts/data/common/killed-in-gaza/generate_killed_list.ts b/scripts/data/common/killed-in-gaza/generate_killed_list.ts index a8ba81d4..f8e2e3dc 100644 --- a/scripts/data/common/killed-in-gaza/generate_killed_list.ts +++ b/scripts/data/common/killed-in-gaza/generate_killed_list.ts @@ -1,41 +1,17 @@ -import fs from "fs"; import { ArabicClass } from "arabic-utils"; +import { readCsv, readCsvToDict, writeCsv } from "../../../utils/csv"; +import { arToArAssertKey } from "./constants"; const pwd = "scripts/data/common/killed-in-gaza"; const arRawNameColumnLabel = "name_ar_raw"; const arEnNameColumnLabel = "name_en"; -const readCsv = (repoPath: string) => { - const csvString = fs.readFileSync(repoPath).toString(); - return csvString.split(/\r?\n/g).map((row) => row.split(",")); -}; - -/** - * read a CSV file and return an object lookup ("dict") with keys - * as the first CSV column value, and values as the second CSV column - */ -const readCsvToDict = (repoPath: string) => { - return readCsv(repoPath).reduce( - (dict, row) => ({ - ...dict, - [row[0]]: row[1], - }), - {} as Record - ); -}; - const rawList = readCsv(`${pwd}/data/raw.csv`); -let arToAr = readCsvToDict(`${pwd}/data/dict_ar_ar.csv`); -const arToEn = readCsvToDict(`${pwd}/data/dict_ar_en.csv`); -// if this matches, our ar->ar dict was read backwards and we need to flip it -if (arToAr["ابوالليل"]) { - console.log("⚠️ inverting ar->ar which was read LTR"); - arToAr = Object.entries(arToAr).reduce( - (flipped, [key, value]) => ({ ...flipped, [value]: key }), - {} - ); -} +const arToAr = readCsvToDict(`${pwd}/data/dict_ar_ar.csv`, { + assertKey: arToArAssertKey, +}); +const arToEn = readCsvToDict(`${pwd}/data/dict_ar_en.csv`); const [rawHeaderRow, ...rawListRows] = rawList; const arRawColumn = rawHeaderRow.indexOf(arRawNameColumnLabel); @@ -74,10 +50,6 @@ const resultList = rawListRows.map((row) => { return [...row, replaceWholeNameSegments(normalizedArName, arToEn)]; }); -const toCsv = (list: string[][]) => list.map((row) => row.join(",")).join("\n"); +const newHeaders = [...rawHeaderRow, arEnNameColumnLabel]; -const newHeaders = [...rawHeaderRow, arEnNameColumnLabel].join(","); -fs.writeFileSync( - `${pwd}/output/result.csv`, - `${newHeaders}\n${toCsv(resultList)}` -); +writeCsv(`${pwd}/output/result.csv`, [newHeaders, ...resultList]); diff --git a/scripts/data/v2/derived/csv.ts b/scripts/data/v2/derived/csv.ts index f3b96925..51d6947e 100644 --- a/scripts/data/v2/derived/csv.ts +++ b/scripts/data/v2/derived/csv.ts @@ -1,4 +1,4 @@ -import { writeCsv, writeJson } from "../../../utils/fs"; +import { writeManifestCsv, writeJson } from "../../../utils/fs"; import { ApiResource } from "../../../../types/api.types"; import { KilledInGaza } from "../../../../types/killed-in-gaza.types"; @@ -20,7 +20,7 @@ const killedRows = killedPersons.reduce( }, [killedRowOrder.slice()] as string[][] ); -writeCsv( +writeManifestCsv( ApiResource.KilledInGazaV2, `${writePath}/killed-in-gaza.csv`, killedRows @@ -33,7 +33,7 @@ const dailyRows = dailies.reduce( }, [dailyRowOrder.slice()] as string[][] ); -writeCsv( +writeManifestCsv( ApiResource.CasualtiesDailyV2, `${writePath}/casualties_daily.csv`, dailyRows diff --git a/scripts/utils/csv.ts b/scripts/utils/csv.ts new file mode 100644 index 00000000..a84e7921 --- /dev/null +++ b/scripts/utils/csv.ts @@ -0,0 +1,50 @@ +import fs from "fs"; + +export const readCsv = (repoPath: string) => { + const csvString = fs.readFileSync(repoPath).toString(); + return csvString.split(/\r?\n/g).map((row) => row.split(",")); +}; + +/** + * read a CSV file and return an object lookup ("dict") with keys + * as the first CSV column value, and values as the second CSV column + * + * @param repoPath relative path from root of repo + * @param options optional object with assertion key to make sure key exists in resulting object + * if assertKey does not exist, the dict is inverted and if the assertKey still does not exist + * the method @throws + */ +export const readCsvToDict = ( + repoPath: string, + options: { assertKey?: string; invert?: boolean } = {} +): Record => { + const result = readCsv(repoPath).reduce( + (dict, row) => ({ + ...dict, + [row[options.invert ? 1 : 0]]: row[options.invert ? 0 : 1], + }), + {} as Record + ); + + if (options.assertKey && !options.invert && !result[options.assertKey]) { + console.log( + `could not find assertKey ${options.assertKey} in resulting dict, inverting` + ); + return readCsvToDict(repoPath, { ...options, invert: true }); + } + + if (options.assertKey && options.invert && !result[options.assertKey]) { + throw new Error( + `Expected dict to include key '${options.assertKey}' but it did not exist in the initial or inverted dict` + ); + } + + return result; +}; + +const toCsv = (rows: string[][]) => + rows.map((columns) => columns.join(",")).join("\r\n"); + +export const writeCsv = (repoPath: string, rows: string[][]) => { + fs.writeFileSync(repoPath, toCsv(rows)); +}; diff --git a/scripts/utils/fs.ts b/scripts/utils/fs.ts index 01f2c961..d9688396 100644 --- a/scripts/utils/fs.ts +++ b/scripts/utils/fs.ts @@ -26,7 +26,7 @@ export const writeJson = ( addToManifest(resource, { minified, unminified: unminifiedFileName }); }; -export const writeCsv = ( +export const writeManifestCsv = ( resource: ApiResource, filePath: string, rows: any[][] diff --git a/scripts/utils/sort-csv.ts b/scripts/utils/sort-csv.ts index c6ffebfa..2ac976f6 100644 --- a/scripts/utils/sort-csv.ts +++ b/scripts/utils/sort-csv.ts @@ -1,5 +1,7 @@ import { ArabicClass } from "arabic-utils"; import fs from "fs"; +import { readCsvToDict } from "./csv"; +import { arToArAssertKey } from "../data/common/killed-in-gaza/constants"; const headerRow = "original,cleaned"; @@ -18,17 +20,21 @@ const rowTransformerForDictResultType = { // for ar_ar dict, we maintain spaces in key to allow for segment consolidation // we normalize the value since that will be used to lookup against normalized // ar values in the en_en dict - ar: (row: string) => { - const [arKey, cleanedValue] = row.split(","); + ar: ([arKey, cleanedValue]: [string, string]) => { trackDuplicateKeysForLogging(arKey); const normalizedValue = new ArabicClass(cleanedValue.trim()).normalize(); - return [arKey, normalizedValue].join(","); + const arRow = [arKey, normalizedValue].join(","); + if (arKey === arToArAssertKey && arRow.endsWith(arToArAssertKey)) { + throw new Error( + `sort-csv expected arKey '${arKey}' in RTL but resulting row was inverted: ${arRow}` + ); + } + return arRow; }, - // for ar_en we split on potential spaces around the comma to ensure no spaces in - // the key column lead to mismatched lookups, and we normalize the AR key per above - en: (row: string) => { - const [arKey, cleanedValue] = row.split(/\s*,\s*/); + // for EN we trim leading and ending spaces from each column value and normalize + // the arabic key column value so that it can match the value column in the ar dict + en: ([arKey, cleanedValue]: [string, string]) => { const normalizedArKey = new ArabicClass(arKey.trim()).normalize(); trackDuplicateKeysForLogging(normalizedArKey); return [normalizedArKey, cleanedValue.trim().toLowerCase()].join(","); @@ -54,10 +60,12 @@ const sortForType = (resultType: "ar" | "en", list: string[]) => { }; const sortCsv = (repoFilePath: string, resultType: "ar" | "en") => { - const csv = fs.readFileSync(repoFilePath).toString(); + const csvDict = readCsvToDict( + repoFilePath, + resultType === "ar" ? { assertKey: arToArAssertKey } : {} + ); - const cleanedRows = csv - .split("\n") + const cleanedRows = Object.entries(csvDict) .map(rowTransformerForDictResultType[resultType]) .filter((row) => !!row);