Merge pull request #15 from sillsdev/readmes

Readmes and reorganization
sillsdev · Oct 11, 2024 · 1603085 · 1603085
2 parents d6c6aa0 + 1f8554a
commit 1603085
Show file tree

Hide file tree

Showing 14 changed files with 303 additions and 234 deletions.
diff --git a/README.md b/README.md
@@ -1,21 +1,49 @@
+> [!warning]
+> This project is currently under development and not ready for public use.
+
 # Ethnolib
 
-This is a small collection of browser components for language apps. Each is published to its own npm package.
+Ethnolib is a small collection of browser components for language apps. Each component may be published to its own npm package.
 
-> [!warning]
-> This project is currently under development and not ready for public use.
+## Components
+
+### [Find-Language](components/language-chooser/common/find-language/README.md)
+
+A package for fuzzy-searching for languages, with a language database based on [langtags.json](https://github.com/silnrsi/langtags). It also includes various utilities for working with language tags and language info.
+
+### [Language Chooser React Hook](components/language-chooser/react/common/language-chooser-react-hook/README.md)
 
-## About the monorepo
+A React hook that provides the logic for a language chooser component. It utilizes the `find-language` component.
 
-Ethnolib is a [monorepo](https://nx.dev/concepts/decisions/why-monorepos) using nx.
+### [MUI Language Chooser](components/language-chooser/react/language-chooser-react-mui/README.md)
+
+A MUI styled language chooser interface, initially developed for use in [BloomDesktop](https://github.com/BloomBooks/BloomDesktop). It uses the `language-chooser-react-hook` component.
+
+## Development
+
+Ethnolib is a [monorepo using nx](https://nx.dev/concepts/decisions/why-monorepos), with npm for package management.
 
 We recommend installing nx globally.
-`npm i -g nx`
+`npm i -g nx`. If you prefer not to, you can simply prefix all commands with `npx` instead.
+
+Nx caches builds for efficiency. To clear the local cache, run `nx reset`.
+
+Use nx to build or run a hot-reload development server. For example, to build or run the MUI language chooser demo:
+
+```
+nx build @ethnolib/language-chooser-react-mui
+```
+
+or
 
-But if you don't, you can just prefix all the commands with `npx`
+```
+nx dev @ethnolib/language-chooser-react-mui
+```
 
-Nx caches builds for efficiency. If at any point you need to clear your local cache, run `nx reset`
+### Dependency Versions
 
-## Language Chooser
+Currently, each package manages its own dependencies in its package-level `package.json` file, but we keep all packages on the same versions of commonly used dependencies for compatibility. Current versions:
 
-See [language-chooser/README.md](components/language-chooser/README.md)
+    "react": "^17.0.2",
+    "@mui/material": "^5.15.19",
+    "@emotion/react": "^11.11.4",
diff --git a/components/language-chooser/README.md b/components/language-chooser/README.md
diff --git a/components/language-chooser/common/find-language/README.md b/components/language-chooser/common/find-language/README.md
@@ -1,2 +1,147 @@
 > [!warning]
 > This project is currently under development and not ready for public use.
+
+# Find-Language
+
+This component contains the logic for fuzzy-searching languages, designed for use by frontend language choosers. The language database is based on [langtags.json](https://github.com/silnrsi/langtags) and also references [langtags.txt](https://github.com/silnrsi/langtags/blob/master/doc/tagging.md#langtagstxt). We use [fuse.js](https://fusejs.io/) for fuzzy-searching.
+
+It also contains various utilities for working with language tags and language information.
+
+This project was initially developed for use in [BloomDesktop](https://github.com/BloomBooks/BloomDesktop).
+
+## Usage
+
+### Installation
+
+`npm i @ethnolib/find-language`
+
+### Searching for languages
+
+Use `searchForLanguage` to search for languages by name (including autonyms, exonyms, or alternative names), associated regions, or ISO 639 tags matching the search string argument. It returns a `FuseResult<ILanguage>[]`, which we recommend passing into a search result modifier. See the Search Result Modification section for details.
+
+### Search Result Modification
+
+This package includes various methods for adjusting search results to handle special cases, such as sign languages and very common languages. Currently, edge cases in the search results are adjusted for Bloom’s use case by the `defaultSearchResultModifier`, which:
+
+- Demarcates portions (substrings) of results which match the search string. For example, if the search string is "nglis" then any instance of "English" would be marked as "E[nglis]h"
+- Ensures the English result is the first result when the user starts typing "English"
+- Ensures the French result is the first result when the user starts typing "French", "Francais" or "Français"
+- Simplifies English and French entries by removing region lists and most alternative names
+- Excludes certain langtags.json entries that don't represent specific extant human languages, such as zxx (no linguistic content) or ang (Old English)
+- Filters out Braille and script codes that do not refer to specific relevant scripts from script options
+
+The `searchResultModifiers.ts` file includes various helper methods that can be used to create modifiers suitable for different use cases.
+
+### Macrolanguages
+
+For details of macrolanguage handling, see [macrolanguageNotes.md](macrolanguageNotes.md).
+
+### Example
+
+```
+import {
+  searchForLanguage,
+  defaultSearchResultModifier,
+  stripResultMetadata,
+  ILanguage,
+} from '@ethnolib/find-language';
+import { FuseResult } from 'fuse.js';
+
+    const searchString = "englisj"; //Fuzzy search will still find English
+    const fuseSearchResults: FuseResult<ILanguage>[] = searchForLanguage(searchString);
+    const defaultModifiedSearchResults: ILanguage[] = defaultSearchResultModifier(fuseSearchResults);
+    const unmodifiedSearchResults: ILanguage[] = stripResultMetadata(fuseSearchResults);
+
+```
+
+The default modifier strips much of the language info from the English and French results for simplicity. It also adds brackets around the portions of each result that match the search string. See the section on Search Result Modification for details.
+
+`defaultModifiedSearchResults[0]`:
+
+```
+  {
+    "exonym": "[Engl]i[sh]",
+    "iso639_3_code": "eng",
+    "languageSubtag": "en",
+    "regionNames": "",
+    "names": [],
+    "scripts": [
+      {
+        "code": "Latn",
+        "name": "Latin"
+      }
+    ],
+    "variants": "",
+    "alternativeTags": []
+  }
+```
+
+Original English result, `unmodifiedSearchResults[0]` (truncated to save space):
+
+```
+  {
+    "autonym": "English",
+    "exonym": "English",
+    "iso639_3_code": "eng",
+    "languageSubtag": "en",
+    "regionNames": "United States, World, Europe, United Arab Emirates, Antigua and Barbuda, ...",
+    "scripts": [
+      {
+        "code": "Latn",
+        "name": "Latin"
+      },
+      {
+        "code": "Brai",
+        "name": "Braille"
+      },
+      {
+        "code": "Dsrt",
+        "name": "Deseret (Mormon)"
+      },
+      {
+        "code": "Dupl",
+        "name": "Duployan stenography Duployan shorthand"
+      },
+      ...
+    ],
+    "names": [
+      "Anglais",
+      "Angleščina",
+      "Anglisy",
+      "Angličtina",
+      "Anglų",
+      "Angol",
+      ...],
+    "alternativeTags": [
+      "en-Latn",
+      "en-US"
+    ]
+  }
+
+```
+
+## Development
+
+See the main [README](../../../../README.md).
+
+### Language data processing pipeline
+
+If you modify [langtagProcessing.ts](./langtagProcessing.ts), run `npm run find-language/common/langtag-processing` to update [languageData.json](language-data/languageData.json) and [shortestTagLookups.json](language-data/shortestTagLookups.json).
+
+#### ISO-639-3 language consolidation
+
+find-language searches languages included in the ISO-639-3 standard; every result returned will have a unique ISO-639-3 code. The entries listed in our source database, langtags.json, are combinations of languages, scripts, regions, and/or variants. [langtagProcessing.ts](./langtagProcessing.ts) consolidates these entries by their ISO-639-3 code and saves the result to [languageData.json](language-data/languageData.json) for searching. For example, langtags.json has separate entries for Abhaz with Cyrillic script, Abhaz with Georgian script, and Abhaz with Latin script. langtagProcessing.ts will combine these into a single entry which lists all three possible scripts and has the superset of the names, regions, etc. of the three entries from langtags.json. This way the search results will contain at most one entry for the language Abhaz.
+
+#### Language tag shortening
+
+The [createTag](./languageTagUtils.ts) function in this package will return the shortest (and thus preferred) tag for a given language/script/region/dialect combination. For example, given language code "emm" (Mamulique), script code "Latn" (Latin) and region code "MX" (Mexico), `createTag` will return "emm" because it is the preferred equivalent tag for emm-Latn-MX.
+
+[langtags.txt](https://github.com/silnrsi/langtags/blob/master/doc/tagging.md#langtagstxt) lists equivalent language tags. langtagProcessing.ts reformats it into [shortestTagLookups.json](language-data/shortestTagLookups.json) which we use for mapping language tags to their shortest equivalent.
+
+### Unit tests
+
+`Find-language` uses Vitest for unit testing. Use nx to run tests:
+
+```
+nx test @ethnolib/find-language
+```
diff --git a/components/language-chooser/common/find-language/getShortestSufficientLangtag.ts b/components/language-chooser/common/find-language/getShortestSufficientLangtag.ts
@@ -1,4 +1,4 @@
-import shortestTags from "./shortestTagLookups.json" assert { type: "json" };
+import shortestTags from "./language-data/shortestTagLookups.json" assert { type: "json" };
 
 const shortPreferredTagLookup = new Map<string, string>();
 for (const tagset of shortestTags) {

diff --git a/components/language-chooser/common/find-language/langtagProcessing.ts b/components/language-chooser/common/find-language/langtagProcessing.ts
@@ -22,7 +22,10 @@ const scriptNames = iso15924.reduce(
 function getIso639_3CodeDetails() {
   const codeDetails = new Set();
   // downloaded from https://iso639-3.sil.org/sites/iso639-3/files/downloads/iso-639-3.tab
-  const codeDetailsFile = fs.readFileSync("iso-639-3.tab", "utf8");
+  const codeDetailsFile = fs.readFileSync(
+    "language-data/iso-639-3.tab",
+    "utf8"
+  );
   for (const line of codeDetailsFile.split("\n")) {
     if (line.length === 0) {
       continue;
@@ -83,7 +86,7 @@ function autonymOrFallback(entry: any, fallback: string) {
   return entry.localnames ? entry.localnames[0] : undefined ?? fallback;
 }
 
-// We want to have one entry for every ISO 630-3 code, whereas langtags.json sometimes has multiple entries per code
+// We want to have one entry for every ISO 639-3 code, whereas langtags.json sometimes has multiple entries per code
 // Combine entry into the entry with matching ISO 639-3 code in langs if there is one, otherwise create a new entry
 function addOrCombineLangtagsEntry(entry, langs) {
   if (!entry.iso639_3) {
@@ -222,7 +225,7 @@ function parseLangtagsJson() {
 }
 
 function parseLangTagsTxt() {
-  const langTagsTxtRaw = fs.readFileSync("langtags.txt", "utf8");
+  const langTagsTxtRaw = fs.readFileSync("language-data/langtags.txt", "utf8");
   const langTagsTxt = langTagsTxtRaw.replaceAll("*", "");
   const lines = langTagsTxt.split("\n");
   const tagLookups = [];
@@ -233,7 +236,10 @@ function parseLangTagsTxt() {
       allTags: tags,
     });
   }
-  fs.writeFileSync("shortestTagLookups.json", JSON.stringify(tagLookups));
+  fs.writeFileSync(
+    "language-data/shortestTagLookups.json",
+    JSON.stringify(tagLookups)
+  );
 }
 
 parseLangtagsJson();

diff --git a/...hooser/common/find-language/iso-639-3.tab → ...find-language/language-data/iso-639-3.tab b/...hooser/common/find-language/iso-639-3.tab → ...find-language/language-data/iso-639-3.tab
diff --git a/...chooser/common/find-language/langtags.txt → .../find-language/language-data/langtags.txt b/...chooser/common/find-language/langtags.txt → .../find-language/language-data/langtags.txt
diff --git a/...mon/find-language/shortestTagLookups.json → ...age/language-data/shortestTagLookups.json b/...mon/find-language/shortestTagLookups.json → ...age/language-data/shortestTagLookups.json
diff --git a/...ts/language-chooser/macrolanguageNotes.md → ...ommon/find-language/macrolanguageNotes.md b/...ts/language-chooser/macrolanguageNotes.md → ...ommon/find-language/macrolanguageNotes.md