Commit ffbf1b7 (0 parents)

Initial commit

File tree

7 files changed: +1752 -0 lines changed

.github/workflows/ts-publish.yaml (+30 lines)

name: Publish Lev-eval For TS

on:
  push:
    branches:
      - main

jobs:
  publish:
    runs-on: ubuntu-latest
    permissions:
      contents: read
      packages: write
    steps:
      - uses: actions/checkout@v3
      # Set up the .npmrc file to publish to GitHub Packages
      - uses: actions/setup-node@v2
        with:
          node-version: '16.x'
          registry-url: 'https://npm.pkg.github.com'
      - run: |
          yarn
          yarn build
      - uses: actions/upload-artifact@v2
        with:
          name: build
          path: ./
      - run: npm publish
        env:
          NODE_AUTH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
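
On every push to main this builds the package and publishes it to the GitHub Packages npm registry, authenticating with the workflow's own GITHUB_TOKEN. Consumers of the published package need a matching scoped-registry entry; a minimal .npmrc sketch (the token variable name is a placeholder, not something this repo defines):

    @rtzr:registry=https://npm.pkg.github.com
    //npm.pkg.github.com/:_authToken=${NPM_TOKEN}

Note that the workflow also uploads the whole working directory (path: ./) as a build artifact, so the stored artifact and the published tarball come from the same build.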

.gitignore (+122 lines)

# Logs
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
lerna-debug.log*
.pnpm-debug.log*

# Diagnostic reports (https://nodejs.org/api/report.html)
report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json

# Runtime data
pids
*.pid
*.seed
*.pid.lock

# Directory for instrumented libs generated by jscoverage/JSCover
lib-cov

# Coverage directory used by tools like istanbul
coverage
*.lcov

# nyc test coverage
.nyc_output

# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
.grunt

# Bower dependency directory (https://bower.io/)
bower_components

# node-waf configuration
.lock-wscript

# Compiled binary addons (https://nodejs.org/api/addons.html)
build/Release

# Dependency directories
node_modules/
jspm_packages/

# Snowpack dependency directory (https://snowpack.dev/)
web_modules/

# TypeScript cache
*.tsbuildinfo

# Optional npm cache directory
.npm

# Optional eslint cache
.eslintcache

# Microbundle cache
.rpt2_cache/
.rts2_cache_cjs/
.rts2_cache_es/
.rts2_cache_umd/

# Optional REPL history
.node_repl_history

# Output of 'npm pack'
*.tgz

# Yarn Integrity file
.yarn-integrity

# dotenv environment variables file
.env
.env.test
.env.production

# parcel-bundler cache (https://parceljs.org/)
.cache
.parcel-cache

# Next.js build output
.next
out

# Nuxt.js build / generate output
.nuxt
dist

# Gatsby files
.cache/
# Comment in the public line if your project uses Gatsby and not Next.js
# https://nextjs.org/blog/next-9-1#public-directory-support
# public

# vuepress build output
.vuepress/dist

# Serverless directories
.serverless/

# FuseBox cache
.fusebox/

# DynamoDB Local files
.dynamodb/

# TernJS port file
.tern-port

# Stores VSCode versions used for testing VSCode extensions
.vscode-test

# yarn v2
.yarn/cache
.yarn/unplugged
.yarn/build-state.yml
.yarn/install-state.gz
.pnp.*
launch.json
package-lock.json

.npmrc

package.json (+21 lines)

{
  "name": "@rtzr/lev-eval",
  "version": "1.0.0",
  "source": "src/index.ts",
  "main": "dist/main.js",
  "module": "dist/module.js",
  "types": "dist/types.d.ts",
  "repository": "https://github.com/rtzr/lev-eval",
  "author": "Arthur Kim",
  "private": false,
  "scripts": {
    "watch": "parcel watch",
    "build": "parcel build"
  },
  "devDependencies": {
    "@parcel/packager-ts": "2.5.0",
    "@parcel/transformer-typescript-types": "2.5.0",
    "parcel": "^2.5.0",
    "typescript": ">=3.0.0"
  }
}
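
The "source", "main", "module", and "types" fields are Parcel's library entry points: parcel build compiles src/index.ts into the CommonJS build, the ES module build, and the bundled type declarations under dist/. A minimal consumer sketch, assuming the package has been published and installed from GitHub Packages:

    import { levenstein, utils } from "@rtzr/lev-eval";

    // "a b" -> "a c": one substituted word out of two
    const { distance, ratio } = utils.wer("a b", "a c");
    console.log(distance, ratio); // 1, 50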

src/index.ts (+2 lines)

export * from './levenstein';
export * from './nlp_utils';

src/levenstein.ts (+87 lines)

const Op = {
  Ins: 1,
  Del: 2,
  Sub: 3,
} as const;

// Reverse lookup: numeric op code -> op name ("Ins" | "Del" | "Sub")
const OpNames = Object.fromEntries(Object.entries(Op).map(([name, code]) => [code, name]));

export const levenstein = (sourceTokens: string[], targetTokens: string[]) => {
  if (sourceTokens.length === 0 || targetTokens.length === 0) {
    const dist = Math.max(sourceTokens.length, targetTokens.length);
    // When either side is empty, the distance is the other side's full length,
    // i.e. a 100% error rate (0 when both sides are empty)
    return { distance: dist, editops: [], ratio: dist === 0 ? 0 : 100.0 };
  }

  // Prepend a blank token so that index i/j in the DP table lines up with
  // the i-th/j-th real token. Note: this mutates the caller's arrays.
  sourceTokens.unshift("");
  targetTokens.unshift("");

  const [m, n] = [sourceTokens.length, targetTokens.length];
  const d: number[][] = Array(m + 1)
    .fill(undefined)
    .map(() => Array(n + 1).fill(0));

  // To avoid copying the full op history into every cell, each cell stores a
  // reference to a predecessor's history plus at most one new op.
  const history: any[][] = new Array(m + 1).fill(undefined).map(() =>
    Array(n + 1)
      .fill(undefined)
      .map(_ => [undefined])
  );
  for (let i = 1; i <= m; i++) {
    d[i][0] = i;
    // i source tokens vs. an empty target prefix: i deletions
    history[i][0] = new Array(i).fill(i).map((elem, idx) => [elem, idx, Op.Del]);
  }

  for (let j = 1; j <= n; j++) {
    d[0][j] = j;
    // an empty source prefix vs. j target tokens: j insertions
    history[0][j] = new Array(j).fill(j).map((elem, idx) => [idx, elem, Op.Ins]);
  }

  for (let j = 1; j <= n; j++) {
    for (let i = 1; i <= m; i++) {
      const substitutionCost = sourceTokens[i] === targetTokens[j] ? 0 : 1;
      const values = [
        { v: d[i][j - 1] + 1, op: Op.Ins, his: history[i][j - 1] },
        { v: d[i - 1][j] + 1, op: Op.Del, his: history[i - 1][j] },
        { v: d[i - 1][j - 1] + substitutionCost, op: Op.Sub, his: history[i - 1][j - 1] },
      ].sort((v1, v2) =>
        // ties are broken in the order insertion > deletion > substitution
        v1.v > v2.v ||
        (v1.v === v2.v && (v1.op === Op.Ins || (v1.op === Op.Del && v2.op !== Op.Del)))
          ? 1
          : -1
      );

      d[i][j] = values[0].v;
      history[i][j] = [values[0].his];
      // a zero-cost substitution is a match, not an edit, so it is not recorded
      if (values[0].op !== Op.Sub || substitutionCost === 1) {
        history[i][j].push([i, j, values[0].op]);
      }
    }
  }

  // Unwrap the nested history: the head of each list is the predecessor's
  // (still nested) history, the tail holds the ops added along the way.
  let result: any[][] = history[m][n];
  while (result[0] && typeof result[0][0] !== "number") {
    const remainders = result.splice(1);
    result = result[0].concat(remainders);
  }

  // Drop the leading sentinel and translate op codes to names
  const final_ops = result.splice(1).map(elem => [elem[0], elem[1], OpNames[elem[2]]]);

  return {
    distance: d[m][n],
    editops: final_ops,
    // sourceTokens still holds the prepended blank, so length - 1 is the original token count
    ratio: (d[m][n] / (sourceTokens.length - 1)) * 100.0,
  };
};
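
A quick usage sketch for the function above (the sample tokens are invented; the return shape is { distance, editops, ratio }):

    import { levenstein } from "./levenstein";

    // levenstein() prepends a blank token to both arrays in place,
    // so spread copies are passed to keep the originals intact.
    const src = ["안", "녕", "하", "세", "요"];
    const tgt = ["안", "녕", "하", "셰", "요"];

    const { distance, editops, ratio } = levenstein([...src], [...tgt]);
    console.log(distance); // 1 (a single substitution)
    console.log(editops);  // [[4, 4, "Sub"]]
    console.log(ratio);    // 20 (1 edit / 5 source tokens * 100)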

src/nlp_utils.ts (+112 lines)

import { levenstein } from "./levenstein";

// Character error rate: compare character by character, ignoring whitespace
const cer = (a: string, b: string) => {
  const replaced_a = a.replace(/\s/g, ``).match(/./g) || [];
  const replaced_b = b.replace(/\s/g, ``).match(/./g) || [];

  const result = levenstein(replaced_a, replaced_b);
  return result;
};

// Word error rate: compare word by word, splitting on whitespace
const wer = (a: string, b: string) => {
  const replaced_a = a.split(/\s+/g);
  const replaced_b = b.split(/\s+/g);

  const result = levenstein(replaced_a, replaced_b);
  return result;
};

/**
 * Example: for
 *   sourceTokens ['안', '녕', '하', '세', '요', '저', '는', '아', '서', '입', '니', '다', 'a', 's', 'd', 'f', 'a', 's', 'd', 'f', 'a']
 *   targetTokens ['안', '녕', '하', '세', '요', '저', '는', '아', '써', '입', '니', '다']
 * the edit ops are
 *   [9, 9, 'Sub'], [13, 12, 'Del'], [14, 12, 'Del'], [15, 12, 'Del'], [16, 12, 'Del'],
 *   [17, 12, 'Del'], [18, 12, 'Del'], [19, 12, 'Del'], [20, 12, 'Del'], [21, 12, 'Del']
 */
const align_string = (ref: string, hyp: string, _editops: any[][], slice_view = 50) => {
  const ref_array = ref.replace(/\s/g, ``).match(/./g) || [];
  const hyp_array = hyp.replace(/\s/g, ``).match(/./g) || [];
  const ref_space_indices = Array.from(ref.replace(/\s+/g, " ").matchAll(/\s/g)).map(
    el => el.index
  );

  let ref_added = 0;
  let hyp_added = 0;
  const ref_added_index = [];
  const ref_sub_index = [];
  for (const op of _editops) {
    // each op is [ref_idx, hyp_idx, op_type]
    // markers: 🟩 inserted, 🟨 deleted, ⬜ substituted
    switch (op[2]) {
      case "Ins":
        ref_added_index.push(op[0] + ref_added);
        ref_array.splice(op[0] + ref_added, 0, "🟩");
        ref_added += 1;
        break;
      case "Sub":
        ref_sub_index.push(op[0] - 1 + ref_added);
        break;
      case "Del":
        hyp_array.splice(op[1] + hyp_added, 0, "🟨");
        hyp_added += 1;
        break;
    }
  }

  // Shift the recorded space positions to account for the inserted markers
  for (let i = 0; i < ref_space_indices.length; i++) {
    ref_space_indices[i] += ref_added_index.filter(el => el < ref_space_indices[i] - i).length;
  }

  // Ideographic Space U+3000, as wide as a full-width CJK character
  const BIG_SPACE = "\u3000";
  const new_sub_array = ref_array.map((el, idx) => {
    if (ref_sub_index.includes(idx)) {
      return "⬜";
    }
    return BIG_SPACE;
  }) as string[];
  // Re-insert the word boundaries into all three rows, right to left
  for (let i = ref_space_indices.length - 1; i >= 0; i--) {
    ref_array.splice(ref_space_indices[i] - i, 0, BIG_SPACE);
    hyp_array.splice(ref_space_indices[i] - i, 0, BIG_SPACE);

    new_sub_array.splice(ref_space_indices[i] - i, 0, BIG_SPACE);
  }

  function get_n_sliced(arr: string[], size: number) {
    const sliced = [];
    const tmp_arr = Array.from(arr);
    while (tmp_arr.length > 0) sliced.push(tmp_arr.splice(0, size).join(""));
    return sliced;
  }
  // Pad half-width characters (Latin letters, digits) with a trailing space
  // so the rows stay visually aligned against full-width CJK characters
  const ref_sliced = get_n_sliced(ref_array, slice_view).map(el =>
    el.replaceAll(/([A-Z0-9])/gi, "$1 ")
  );
  const hyp_sliced = get_n_sliced(hyp_array, slice_view).map(el =>
    el.replaceAll(/([A-Z0-9])/gi, "$1 ")
  );
  const sub_sliced = get_n_sliced(new_sub_array, slice_view);

  return [ref_sliced, hyp_sliced, sub_sliced];
};

export const utils = {
  cer,
  wer,
  align_string,
};
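
And a sketch of how the utilities compose (the strings are invented for the example; the third return value of align_string marks substituted positions with ⬜):

    import { utils } from "./nlp_utils";

    const ref = "안녕하세요 저는 아서입니다";
    const hyp = "안녕하세요 저는 아써입니다";

    // Character error rate over whitespace-stripped characters
    const { distance, editops, ratio } = utils.cer(ref, hyp);
    console.log(distance, ratio); // 1, ~8.33 (1 edit over 12 reference characters)

    // Three display rows: reference, hypothesis, substitution markers
    const [refRows, hypRows, subRows] = utils.align_string(ref, hyp, editops);
    console.log([refRows, hypRows, subRows].map(r => r.join("\n")).join("\n"));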
