Initial commit

kimdwkimdw · kimdwkimdw · commit ffbf1b7139df · 2022-05-16T19:50:40.000+09:00
diff --git a/.github/workflows/ts-publish.yaml b/.github/workflows/ts-publish.yaml
@@ -0,0 +1,30 @@
+# Builds the package and publishes it to GitHub Packages on every push to main.
+name: Publish Lev-eval For TS
+
+on:
+  push:
+    branches:
+      - main
+
+jobs:
+  publish:
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      # Lets the job's GITHUB_TOKEN publish to GitHub Packages.
+      packages: write
+    steps:
+      - uses: actions/checkout@v3
+      # Setup .npmrc file to publish to GitHub Packages
+      - uses: actions/setup-node@v2
+        with:
+          node-version: '16.x'
+          registry-url: 'https://npm.pkg.github.com'
+      # Install exactly what yarn.lock pins (fail instead of silently updating
+      # the lockfile), then run the package's "build" script.
+      - run: |
+          yarn --frozen-lockfile
+          yarn build
+      # NOTE(review): "path: ./" uploads the whole workspace, node_modules
+      # included; narrowing it to the build output is probably what is wanted.
+      # NOTE(review): upload-artifact@v2 and setup-node@v2 are old major
+      # versions; confirm they are still supported before relying on this job.
+      - uses: actions/upload-artifact@v2
+        with:
+          name: build
+          path: ./
+      # NOTE(review): this runs on every push to main; presumably the registry
+      # rejects re-publishing an unchanged version, so bump "version" in
+      # package.json with each release - verify.
+      - run: npm publish
+        env:
+          NODE_AUTH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,122 @@
+# Logs
+logs
+*.log
+npm-debug.log*
+yarn-debug.log*
+yarn-error.log*
+lerna-debug.log*
+.pnpm-debug.log*
+
+# Diagnostic reports (https://nodejs.org/api/report.html)
+report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
+
+# Runtime data
+pids
+*.pid
+*.seed
+*.pid.lock
+
+# Directory for instrumented libs generated by jscoverage/JSCover
+lib-cov
+
+# Coverage directory used by tools like istanbul
+coverage
+*.lcov
+
+# nyc test coverage
+.nyc_output
+
+# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
+.grunt
+
+# Bower dependency directory (https://bower.io/)
+bower_components
+
+# node-waf configuration
+.lock-wscript
+
+# Compiled binary addons (https://nodejs.org/api/addons.html)
+build/Release
+
+# Dependency directories
+node_modules/
+jspm_packages/
+
+# Snowpack dependency directory (https://snowpack.dev/)
+web_modules/
+
+# TypeScript cache
+*.tsbuildinfo
+
+# Optional npm cache directory
+.npm
+
+# Optional eslint cache
+.eslintcache
+
+# Microbundle cache
+.rpt2_cache/
+.rts2_cache_cjs/
+.rts2_cache_es/
+.rts2_cache_umd/
+
+# Optional REPL history
+.node_repl_history
+
+# Output of 'npm pack'
+*.tgz
+
+# Yarn Integrity file
+.yarn-integrity
+
+# dotenv environment variables file
+.env
+.env.test
+.env.production
+
+# parcel-bundler cache (https://parceljs.org/)
+.cache
+.parcel-cache
+
+# Next.js build output
+.next
+out
+
+# Nuxt.js build / generate output
+.nuxt
+dist
+
+# Gatsby files
+.cache/
+# Uncomment the "public" line below if your project uses Gatsby and not Next.js
+# https://nextjs.org/blog/next-9-1#public-directory-support
+# public
+
+# vuepress build output
+.vuepress/dist
+
+# Serverless directories
+.serverless/
+
+# FuseBox cache
+.fusebox/
+
+# DynamoDB Local files
+.dynamodb/
+
+# TernJS port file
+.tern-port
+
+# Stores VSCode versions used for testing VSCode extensions
+.vscode-test
+
+# yarn v2
+.yarn/cache
+.yarn/unplugged
+.yarn/build-state.yml
+.yarn/install-state.gz
+.pnp.*
+launch.json
+package-lock.json
+
+.npmrc
diff --git a/package.json b/package.json
@@ -0,0 +1,21 @@
+{
+  "name": "@rtzr/lev-eval",
+  "version": "1.0.0",
+  "source": "src/index.ts",
+  "main": "dist/main.js",
+  "module": "dist/module.js",
+  "types": "dist/types.d.ts",
+  "repository": "https://github.com/rtzr/lev-eval",
+  "author": "Arthur Kim",
+  "private": false,
+  "scripts": {
+    "watch": "parcel watch",
+    "build": "parcel build"
+  },
+  "devDependencies": {
+    "@parcel/packager-ts": "2.5.0",
+    "@parcel/transformer-typescript-types": "2.5.0",
+    "parcel": "^2.5.0",
+    "typescript": ">=3.0.0"
+  }
+}
diff --git a/src/index.ts b/src/index.ts
@@ -0,0 +1,2 @@
+export * from './levenstein';
+export * from './nlp_utils';
diff --git a/src/levenstein.ts b/src/levenstein.ts
@@ -0,0 +1,87 @@
+/** Numeric codes identifying the three edit operations. */
+const Op = {
+    Ins: 1,
+    Del: 2,
+    Sub: 3,
+} as const;
+
+/** Reverse lookup table: operation code -> operation name ("Ins", "Del", "Sub"). */
+const OpNames: Record<number, string> = Object.fromEntries(
+    Object.entries(Op).map(([name, code]) => [code, name])
+);
+
+/**
+ * Computes the Levenshtein (edit) distance between two token sequences together
+ * with the edit operations that turn `sourceTokens` into `targetTokens`.
+ *
+ * Neither input array is modified.
+ *
+ * @param sourceTokens - Tokens of the sequence being edited.
+ * @param targetTokens - Tokens of the sequence to arrive at.
+ * @returns An object with
+ *   - `distance`: the minimum number of insertions, deletions and substitutions;
+ *   - `editops`: `[sourceIndex, targetIndex, name]` triples in path order, where
+ *     `name` is "Ins", "Del" or "Sub". Both indices count how many tokens of the
+ *     respective sequence have been consumed once the operation is applied, so
+ *     they are 1-based and 0 means "nothing consumed yet";
+ *   - `ratio`: `distance / sourceTokens.length * 100`. For an empty source the
+ *     quotient is undefined, so 0 is reported when the target is empty as well
+ *     and 100 otherwise.
+ */
+export const levenstein = (sourceTokens: string[], targetTokens: string[]) => {
+    const source_len = sourceTokens.length;
+    const target_len = targetTokens.length;
+
+    // d[i][j]: distance between the first i source tokens and the first j target tokens.
+    const d: number[][] = Array.from({ length: source_len + 1 }, () =>
+        new Array<number>(target_len + 1).fill(0)
+    );
+    // choice[i][j]: operation code of the last step on the chosen path into (i, j).
+    const choice: number[][] = Array.from({ length: source_len + 1 }, () =>
+        new Array<number>(target_len + 1).fill(0)
+    );
+
+    // First column: only deletions can empty a source prefix.
+    for (let i = 1; i <= source_len; i++) {
+        d[i][0] = i;
+        choice[i][0] = Op.Del;
+    }
+    // First row: only insertions can build a target prefix from nothing.
+    for (let j = 1; j <= target_len; j++) {
+        d[0][j] = j;
+        choice[0][j] = Op.Ins;
+    }
+
+    for (let j = 1; j <= target_len; j++) {
+        for (let i = 1; i <= source_len; i++) {
+            const substitution_cost = sourceTokens[i - 1] === targetTokens[j - 1] ? 0 : 1;
+            const ins_cost = d[i][j - 1] + 1;
+            const del_cost = d[i - 1][j] + 1;
+            const sub_cost = d[i - 1][j - 1] + substitution_cost;
+
+            // Cheapest step wins; ties prefer the diagonal (match/substitution),
+            // then insertion, then deletion. An explicit selection keeps the
+            // result independent of how an engine's sort treats an inconsistent
+            // comparator. NOTE(review): the order was chosen to reproduce what
+            // the previous sort-based selection yields under V8 - confirm if
+            // other engines matter.
+            if (sub_cost <= ins_cost && sub_cost <= del_cost) {
+                d[i][j] = sub_cost;
+                choice[i][j] = Op.Sub;
+            } else if (ins_cost <= del_cost) {
+                d[i][j] = ins_cost;
+                choice[i][j] = Op.Ins;
+            } else {
+                d[i][j] = del_cost;
+                choice[i][j] = Op.Del;
+            }
+        }
+    }
+
+    // Walk the chosen path back from the final cell, then reverse it so the
+    // operations are listed from the start of the sequences to the end.
+    // Typed any[][] to stay compatible with the previously inferred return type.
+    const editops: any[][] = [];
+    let i = source_len;
+    let j = target_len;
+    while (i > 0 || j > 0) {
+        const op = choice[i][j];
+        if (op === Op.Ins) {
+            editops.push([i, j, OpNames[op]]);
+            j -= 1;
+        } else if (op === Op.Del) {
+            editops.push([i, j, OpNames[op]]);
+            i -= 1;
+        } else {
+            // A diagonal step is only an edit when the two tokens differ.
+            if (sourceTokens[i - 1] !== targetTokens[j - 1]) {
+                editops.push([i, j, OpNames[Op.Sub]]);
+            }
+            i -= 1;
+            j -= 1;
+        }
+    }
+    editops.reverse();
+
+    const distance = d[source_len][target_len];
+    let ratio: number;
+    if (source_len > 0) {
+        ratio = (distance / source_len) * 100.0;
+    } else {
+        ratio = distance === 0 ? 0 : 100.0;
+    }
+
+    return { distance, editops, ratio };
+};
diff --git a/src/nlp_utils.ts b/src/nlp_utils.ts
@@ -0,0 +1,112 @@
+import { levenstein } from "./levenstein";
+
+/**
+ * Character-level comparison of two strings.
+ *
+ * All whitespace is removed from both strings, what remains is split into
+ * single-character tokens (as matched by `/./g`, i.e. UTF-16 code units), and
+ * the two token lists are passed to `levenstein` with `a` as the source and
+ * `b` as the target.
+ *
+ * @param a - Source text.
+ * @param b - Target text.
+ * @returns Whatever `levenstein` returns for the two character sequences.
+ */
+const cer = (a: string, b: string) => {
+    const to_chars = (text: string) => text.replace(/\s/g, ``).match(/./g) || [];
+    return levenstein(to_chars(a), to_chars(b));
+};
+
+/**
+ * Word-level comparison of two strings.
+ *
+ * Each string is trimmed and split on runs of whitespace, and the resulting
+ * word lists are passed to `levenstein` with `a` as the source and `b` as the
+ * target. Trimming first keeps leading or trailing whitespace from producing
+ * an empty-string token that would be counted as an extra word.
+ *
+ * An empty or whitespace-only string still yields a single empty-string token,
+ * because splitting an empty string returns `[""]`.
+ *
+ * @param a - Source text.
+ * @param b - Target text.
+ * @returns Whatever `levenstein` returns for the two word sequences.
+ */
+const wer = (a: string, b: string) => {
+    const source_words = a.trim().split(/\s+/g);
+    const target_words = b.trim().split(/\s+/g);
+
+    return levenstein(source_words, target_words);
+};
+
+/**
+ * Renders a reference/hypothesis pair as three aligned rows of text.
+ *
+ * Both strings are stripped of whitespace and split into single characters.
+ * The edit operations are then applied as markers:
+ *   - "Ins": a "🟩" placeholder is inserted into the reference row;
+ *   - "Del": a "🟨" placeholder is inserted into the hypothesis row;
+ *   - "Sub": the position is flagged with "⬜" in the marker row.
+ * Afterwards an ideographic space (U+3000) is inserted into all three rows
+ * wherever the reference had whitespace, each row is cut into chunks of
+ * `slice_view` entries, and in the reference and hypothesis rows every ASCII
+ * letter or digit gets a trailing space.
+ *
+ * Example input:
+ *
+ *   reference characters
+ *   ['안', '녕', '하', '세', '요', '저', '는', '아', '서', '입', '니', '다', 'a', 's', 'd', 'f', 'a', 's', 'd', 'f', 'a']
+ *
+ *   hypothesis characters
+ *   ['안', '녕', '하', '세', '요', '저', '는', '아', '써', '입', '니', '다']
+ *
+ *   edit ops
+ *   [9, 9, 'Sub']
+ *   [13, 12, 'Del']
+ *   [14, 12, 'Del']
+ *   [15, 12, 'Del']
+ *   [16, 12, 'Del']
+ *   [17, 12, 'Del']
+ *   [18, 12, 'Del']
+ *   [19, 12, 'Del']
+ *   [20, 12, 'Del']
+ *   [21, 12, 'Del']
+ *
+ * @param ref - Reference text.
+ * @param hyp - Hypothesis text.
+ * @param _editops - `[ref_idx, hyp_idx, op_type]` triples with `op_type` one of
+ *   "Ins", "Sub", "Del"; other values are ignored. Presumably character-level
+ *   operations such as `cer(ref, hyp).editops` - verify against callers.
+ * @param slice_view - Maximum number of entries per output chunk (default 50).
+ * @returns `[ref_chunks, hyp_chunks, marker_chunks]`, each an array of strings.
+ * @throws RangeError when `slice_view` is not a number of at least 1.
+ */
+const align_string = (ref: string, hyp: string, _editops: any[][], slice_view = 50) => {
+    // A chunk size below 1 would remove nothing per step and loop forever below.
+    if (!(slice_view >= 1)) {
+        throw new RangeError(`slice_view must be at least 1, got ${slice_view}`);
+    }
+
+    const ref_array: string[] = ref.replace(/\s/g, ``).match(/./g) || [];
+    const hyp_array: string[] = hyp.replace(/\s/g, ``).match(/./g) || [];
+    // Positions of the whitespace characters once every run is collapsed to one
+    // space; position minus rank equals the number of characters before it.
+    const ref_space_indices = Array.from(ref.replace(/\s+/g, " ").matchAll(/\s/g)).map(
+        el => el.index ?? 0
+    );
+
+    let ref_added = 0;
+    let hyp_added = 0;
+    const ref_added_index: number[] = [];
+    const ref_sub_index = new Set<number>();
+    for (const op of _editops) {
+        // [ref_idx, hyp_idx, op_type]
+        //  🟩🟩🟨⬜
+
+        switch (op[2]) {
+            case "Ins":
+                ref_added_index.push(op[0] + ref_added);
+                ref_array.splice(op[0] + ref_added, 0, "🟩");
+                ref_added += 1;
+                break;
+            case "Sub":
+                // op[0] is 1-based; shift by the placeholders inserted so far.
+                ref_sub_index.add(op[0] - 1 + ref_added);
+                break;
+            case "Del":
+                hyp_array.splice(op[1] + hyp_added, 0, "🟨");
+                hyp_added += 1;
+                break;
+        }
+    }
+
+    // Move each space to the right by the number of placeholders inserted before it.
+    for (let i = 0; i < ref_space_indices.length; i++) {
+        ref_space_indices[i] += ref_added_index.filter(el => el < ref_space_indices[i] - i).length;
+    }
+
+    // Ideographic Space U+3000
+    const BIG_SPACE = "\u3000";
+    const new_sub_array: string[] = ref_array.map((_el, idx) =>
+        ref_sub_index.has(idx) ? "⬜" : BIG_SPACE
+    );
+    // Insert from the last space backwards so earlier positions stay valid.
+    for (let i = ref_space_indices.length - 1; i >= 0; i--) {
+        ref_array.splice(ref_space_indices[i] - i, 0, BIG_SPACE);
+        hyp_array.splice(ref_space_indices[i] - i, 0, BIG_SPACE);
+
+        new_sub_array.splice(ref_space_indices[i] - i, 0, BIG_SPACE);
+    }
+
+    // Joins consecutive groups of at most `size` entries into strings.
+    function get_n_sliced(arr: string[], size: number): string[] {
+        const sliced: string[] = [];
+        const tmp_arr = Array.from(arr);
+        while (tmp_arr.length > 0) sliced.push(tmp_arr.splice(0, size).join(""));
+        return sliced;
+    }
+
+    // A global-flag replace() is equivalent to replaceAll() here and does not
+    // require an ES2021 runtime.
+    const ref_sliced = get_n_sliced(ref_array, slice_view).map(el =>
+        el.replace(/([A-Z0-9])/gi, "$1 ")
+    );
+    const hyp_sliced = get_n_sliced(hyp_array, slice_view).map(el =>
+        el.replace(/([A-Z0-9])/gi, "$1 ")
+    );
+    const sub_sliced = get_n_sliced(new_sub_array, slice_view);
+
+    return [ref_sliced, hyp_sliced, sub_sliced];
+};
+
+/** Bundle of the text-evaluation helpers defined in this module. */
+export const utils = {
+    cer: cer,
+    wer: wer,
+    align_string: align_string,
+};
diff --git a/yarn.lock b/yarn.lock

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+export * from './levenstein';`
	`2`	`+export * from './nlp_utils';`