Skip to content

Commit dcced95

Browse files
authored
Merge pull request #28 from pluots/stringops
Add the levenshtein string similarity algorithm
2 parents 285e1bd + 0ab2bc2 commit dcced95

File tree

8 files changed

+463
-43
lines changed

8 files changed

+463
-43
lines changed

CHANGELOG.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
### Added
88

9-
### Changed
9+
Add the `levenshtein` string distance algorithm.
1010

1111

1212
## [0.1.10] - 2023-10-05

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ members = [
77
"udf-lipsum",
88
"udf-uuid",
99
"test-integration",
10+
"udf-stringops",
1011
]
1112

1213
[profile.release]

Dockerfile

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,7 @@
99
# docker exec -it mdb-udf-suite-c mariadb -pexample
1010
# ```
1111

12-
FROM rust:latest AS build
13-
14-
ENV CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse
12+
FROM rust:1.76 AS build
1513

1614
WORKDIR /build
1715

README.md

Lines changed: 184 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,13 @@ The following UDFs are includes:
1818
`xxhash`
1919
- [IP Functions](#ip-address-functions) for interop: `ip_validate`,
2020
`ip_to_canonical`, `ip_to_ipv6_mapped`
21+
- [String Operations](#string-operations): Calculations such as Levenshtein
22+
edit distance, including limited and normalized versions.
2123
- [Jsonify](#jsonify): convert any data to JSON
2224
- [Lipsum](#lipsum): generate random text
2325

26+
See the relevant section for more information.
27+
2428
### UUID
2529

2630
Provide UUID functions similar to the Postgres [`uuid-osp`] package:
@@ -34,6 +38,29 @@ Provide UUID functions similar to the Postges [`uuid-osp`] package:
3438

3539
See the [UUID Readme](/udf-uuid/README.md) for more information
3640

41+
#### Usage
42+
43+
```text
44+
note: type uuid is type string
45+
uuid_generate_v1() -> uuid
46+
uuid_generate_v1mc() -> uuid
47+
uuid_generate_v4() -> uuid
48+
uuid_generate_v6([node_addr: string]) -> uuid
49+
uuid_generate_v7() -> uuid
50+
uuid_nil() -> uuid
51+
uuid_max() -> uuid
52+
uuid_ns_dns() -> string
53+
uuid_ns_url() -> string
54+
uuid_ns_oid() -> string
55+
uuid_ns_x500() -> string
56+
uuid_is_valid(uuid: uuid) -> boolean
57+
uuid_to_bin(uuid: uuid) -> bytes
58+
uuid_from_bin() -> uuid
59+
bin_to_uuid() -> uuid
60+
```
61+
62+
#### Examples
63+
3764
```text
3865
MariaDB [(none)]> select uuid_generate_v6();
3966
+--------------------------------------+
@@ -54,7 +81,7 @@ MariaDB [(none)]> select hex(uuid_to_bin(uuid_generate_v4()));
5481

5582
[`uuid-osp`]: https://www.postgresql.org/docs/current/uuid-ossp.html
5683

57-
## Hash Algorithms
84+
### Hash Algorithms
5885

5986
This library provides the following functions:
6087

@@ -68,11 +95,54 @@ This library provides the following functions:
6895
- `xxhash3`, `xxhash32`, `xxhash64`, `xxhash` (`xxhash` is an alias for
6996
`xxhash64`)
7097

71-
All of these return hex strings by defaulti. `_bin` functions are also
98+
All of these return hex strings by default. `_bin` functions are also
7299
provided that return the binary result without going through hexification,
73100
suitable for storage in a `BINARY(X)` column.
74101

75102

103+
#### Usage
104+
105+
```text
106+
blake2b512(a: any [, ...]) -> string
107+
blake2b512_bin(a: any [, ...]) -> bytes
108+
blake2s256(a: any [, ...]) -> string
109+
blake2s256_bin(a: any [, ...]) -> bytes
110+
blake3(a: any [, ...]) -> string
111+
blake3_bin(a: any [, ...]) -> bytes
112+
blake3_thd(a: any [, ...]) -> string
113+
blake3_thd_bin(a: any [, ...]) -> bytes
114+
md5_u(a: any [, ...]) -> string
115+
md5_u_bin(a: any [, ...]) -> bytes
116+
sha1_u(a: any [, ...]) -> string
117+
sha1_u_bin(a: any [, ...]) -> bytes
118+
sha224(a: any [, ...]) -> string
119+
sha224_bin(a: any [, ...]) -> bytes
120+
sha256(a: any [, ...]) -> string
121+
sha256_bin(a: any [, ...]) -> bytes
122+
sha384(a: any [, ...]) -> string
123+
sha384_bin(a: any [, ...]) -> bytes
124+
sha512(a: any [, ...]) -> string
125+
sha512_bin(a: any [, ...]) -> bytes
126+
keccak224(a: any [, ...]) -> string
127+
keccak224_bin(a: any [, ...]) -> bytes
128+
keccak256(a: any [, ...]) -> string
129+
keccak256_bin(a: any [, ...]) -> bytes
130+
sha3_224(a: any [, ...]) -> string
131+
sha3_224_bin(a: any [, ...]) -> bytes
132+
sha3_256(a: any [, ...]) -> string
133+
sha3_256_bin(a: any [, ...]) -> bytes
134+
sha3_384(a: any [, ...]) -> string
135+
sha3_384_bin(a: any [, ...]) -> bytes
136+
sha3_512(a: any [, ...]) -> string
137+
sha3_512_bin(a: any [, ...]) -> bytes
138+
xxhash(a: any [, ...]) -> integer
139+
xxhash3(a: any [, ...]) -> integer
140+
xxhash32(a: any [, ...]) -> integer
141+
xxhash64(a: any [, ...]) -> integer
142+
```
143+
144+
#### Examples
145+
76146
```text
77147
MariaDB [(none)]> select blake3("Hello, world!");
78148
+------------------------------------------------------------------+
@@ -114,15 +184,67 @@ MariaDB [(none)]> select xxhash('Hello, ', 0x77, 'orld', '!');
114184

115185
Note that in SQL, all integers are an `i64`, all floats are a `f64`, and all
116186
decimals are represented as a string to the UDF API. This library hashes these
117-
types as their little endian representation. (You only need to worry about this
118-
if you have very obscure platform compatibility requirements, and strings and
119-
blobs are always unambiguous).
187+
types as their little endian representation on all platforms. (You only need
188+
to worry about this if you have very obscure platform compatibility
189+
requirements. Strings and blobs are always unambiguous).
190+
191+
### String Operations
192+
193+
Provide the function `levenshtein`, which calculates the Levenshtein edit
194+
distance between two strings. There is also `levenshtein_normalized` that
195+
returns a value between 0.0 (identical) and 1.0 (significantly different).
196+
197+
If a limit is provided as a third argument, the operation will terminate if
198+
that limit is exceeded. This can help to improve performance if filtering
199+
dissimilar strings.
200+
201+
These algorithms provide a _byte_ edit distance, rather than Unicode characters or
202+
graphemes. These options may be added in the future.
203+
204+
These algorithms are implemented by the [`rapidfuzz`] crate.
205+
206+
[`rapidfuzz`]: https://crates.io/crates/rapidfuzz
207+
208+
#### Usage
209+
210+
```text
211+
levenshtein(a: str, b: str [, limit: integer]) -> integer;
212+
levenshtein_normalized(a: str, b: str [, limit: real]) -> real;
213+
```
214+
215+
#### Example
216+
217+
```text
218+
MariaDB [(none)]> SELECT levenshtein('foo', 'moose'), levenshtein_normalized('foo', 'moos');
219+
+-----------------------------+---------------------------------------+
220+
| levenshtein('foo', 'moose') | levenshtein_normalized('foo', 'moos') |
221+
+-----------------------------+---------------------------------------+
222+
| 3 | 0.5 |
223+
+-----------------------------+---------------------------------------+
224+
1 row in set (0.001 sec)
225+
226+
MariaDB [(none)]> SELECT levenshtein('foo', 'moose', 2), levenshtein_normalized('foo', 'moos', 0.3);
227+
+--------------------------------+--------------------------------------------+
228+
| levenshtein('foo', 'moose', 2) | levenshtein_normalized('foo', 'moos', 0.3) |
229+
+--------------------------------+--------------------------------------------+
230+
| 2 | 0.3 |
231+
+--------------------------------+--------------------------------------------+
232+
1 row in set (0.001 sec)
233+
```
120234

121235
### Jsonify
122236

123237
Provide the function `jsonify`, which quickly creates JSON output for any given
124238
inputs.
125239

240+
#### Usage
241+
242+
```text
243+
jsonify(a: any [, ...]) -> string
244+
```
245+
246+
#### Examples
247+
126248
```text
127249
MariaDB [db]> select jsonify(qty, cost, class) from t1 limit 4;
128250
+-------------------------------------+
@@ -155,6 +277,14 @@ MariaDB [db]> select jsonify(uuid() as uuid, qty as quantity, cost) from t1 limi
155277

156278
Uses the [lipsum crate] to generate lipsum strings with a specified word count.
157279

280+
#### Usage
281+
282+
```text
283+
lipsum(count: integer [, seed: integer]) -> string
284+
```
285+
286+
#### Examples
287+
158288

159289
```text
160290
MariaDB [(none)]> select lipsum(10);
@@ -168,7 +298,7 @@ MariaDB [(none)]> select lipsum(10);
168298

169299
[lipsum crate]: https://docs.rs/lipsum/latest/lipsum/
170300

171-
## IP Address Functions
301+
### IP Address Functions
172302

173303
We provide three IP functions:
174304

@@ -177,7 +307,18 @@ We provide three IP functions:
177307
- `ip_to_ipv6_mapped` which converts ipv4 addresses to their ipv6 form (e.g.
178308
for interop with the `INET6` data type)
179309
- `ip_to_canonical` which reverses the mapping operation
310+
311+
#### Usage
312+
313+
```text
314+
ip_validate(ip: string) -> string
315+
ip_to_canonical(ip: string) -> string
316+
ip_to_ipv6_mapped(ip: string) -> string
180317
```
318+
319+
#### Examples
320+
321+
```text
181322
MariaDB [db]> select
182323
-> input,
183324
-> ip_validate(input),
@@ -205,11 +346,12 @@ The desired files can be copied to the plugin directory (usually
205346
`/usr/lib/mysql/plugin`) and selectively loaded:
206347

207348
```sql
349+
-- **** Hash functions ****
208350
CREATE OR REPLACE FUNCTION blake2b512 RETURNS string SONAME 'libudf_hash.so';
209351
CREATE OR REPLACE FUNCTION blake2s256 RETURNS string SONAME 'libudf_hash.so';
210352
CREATE OR REPLACE FUNCTION blake3 RETURNS string SONAME 'libudf_hash.so';
211353
CREATE OR REPLACE FUNCTION blake3_thd RETURNS string SONAME 'libudf_hash.so';
212-
-- the md5 and sha functions have builtin versions
354+
-- the md5 and sha functions have builtin versions, hence the `_u` suffix
213355
CREATE OR REPLACE FUNCTION md5_u RETURNS string SONAME 'libudf_hash.so';
214356
CREATE OR REPLACE FUNCTION sha1_u RETURNS string SONAME 'libudf_hash.so';
215357
CREATE OR REPLACE FUNCTION sha224 RETURNS string SONAME 'libudf_hash.so';
@@ -221,7 +363,6 @@ CREATE OR REPLACE FUNCTION keccak256 RETURNS string SONAME 'libudf_hash.so';
221363
CREATE OR REPLACE FUNCTION sha3_224 RETURNS string SONAME 'libudf_hash.so';
222364
CREATE OR REPLACE FUNCTION sha3_256 RETURNS string SONAME 'libudf_hash.so';
223365
CREATE OR REPLACE FUNCTION sha3_384 RETURNS string SONAME 'libudf_hash.so';
224-
CREATE OR REPLACE FUNCTION sha3_384_bin RETURNS string SONAME 'libudf_hash.so';
225366
CREATE OR REPLACE FUNCTION sha3_512 RETURNS string SONAME 'libudf_hash.so';
226367
CREATE OR REPLACE FUNCTION xxhash RETURNS integer SONAME 'libudf_hash.so';
227368
CREATE OR REPLACE FUNCTION xxhash3 RETURNS integer SONAME 'libudf_hash.so';
@@ -245,36 +386,41 @@ CREATE OR REPLACE FUNCTION keccak224_bin RETURNS string SONAME 'libudf_hash.so';
245386
CREATE OR REPLACE FUNCTION keccak256_bin RETURNS string SONAME 'libudf_hash.so';
246387
CREATE OR REPLACE FUNCTION sha3_224_bin RETURNS string SONAME 'libudf_hash.so';
247388
CREATE OR REPLACE FUNCTION sha3_256_bin RETURNS string SONAME 'libudf_hash.so';
389+
CREATE OR REPLACE FUNCTION sha3_384_bin RETURNS string SONAME 'libudf_hash.so';
248390
CREATE OR REPLACE FUNCTION sha3_512_bin RETURNS string SONAME 'libudf_hash.so';
249391

250-
-- JSON creation function
251-
CREATE FUNCTION jsonify RETURNS string SONAME 'libudf_jsonify.so';
252-
253-
-- IP functions
254-
CREATE FUNCTION ip_validate RETURNS string SONAME 'libudf_net.so';
255-
CREATE FUNCTION ip_to_canonical RETURNS string SONAME 'libudf_net.so';
256-
CREATE FUNCTION ip_to_ipv6_mapped RETURNS string SONAME 'libudf_net.so';
257-
258-
-- random string generation
259-
CREATE FUNCTION lipsum RETURNS string SONAME 'libudf_lipsum.so';
260-
261-
-- UUID interfaces
262-
CREATE FUNCTION uuid_generate_v1 RETURNS string SONAME 'libudf_uuid.so';
263-
CREATE FUNCTION uuid_generate_v1mc RETURNS string SONAME 'libudf_uuid.so';
264-
CREATE FUNCTION uuid_generate_v4 RETURNS string SONAME 'libudf_uuid.so';
265-
CREATE FUNCTION uuid_generate_v6 RETURNS string SONAME 'libudf_uuid.so';
266-
CREATE FUNCTION uuid_generate_v7 RETURNS string SONAME 'libudf_uuid.so';
267-
CREATE FUNCTION uuid_nil RETURNS string SONAME 'libudf_uuid.so';
268-
CREATE FUNCTION uuid_max RETURNS string SONAME 'libudf_uuid.so';
269-
CREATE FUNCTION uuid_ns_dns RETURNS string SONAME 'libudf_uuid.so';
270-
CREATE FUNCTION uuid_ns_url RETURNS string SONAME 'libudf_uuid.so';
271-
CREATE FUNCTION uuid_ns_oid RETURNS string SONAME 'libudf_uuid.so';
272-
CREATE FUNCTION uuid_ns_x500 RETURNS string SONAME 'libudf_uuid.so';
273-
CREATE FUNCTION uuid_is_valid RETURNS integer SONAME 'libudf_uuid.so';
274-
CREATE FUNCTION uuid_to_bin RETURNS string SONAME 'libudf_uuid.so';
275-
CREATE FUNCTION uuid_from_bin RETURNS string SONAME 'libudf_uuid.so';
392+
-- **** JSON creation function ****
393+
CREATE OR REPLACE FUNCTION jsonify RETURNS string SONAME 'libudf_jsonify.so';
394+
395+
-- **** IP functions ****
396+
CREATE OR REPLACE FUNCTION ip_validate RETURNS string SONAME 'libudf_net.so';
397+
CREATE OR REPLACE FUNCTION ip_to_canonical RETURNS string SONAME 'libudf_net.so';
398+
CREATE OR REPLACE FUNCTION ip_to_ipv6_mapped RETURNS string SONAME 'libudf_net.so';
399+
400+
-- **** string operation functions ****
401+
CREATE OR REPLACE FUNCTION levenshtein RETURNS integer SONAME 'libudf_stringops.so';
402+
CREATE OR REPLACE FUNCTION levenshtein_normalized RETURNS real SONAME 'libudf_stringops.so';
403+
404+
-- **** random string generation ****
405+
CREATE OR REPLACE FUNCTION lipsum RETURNS string SONAME 'libudf_lipsum.so';
406+
407+
-- **** UUID interfaces ****
408+
CREATE OR REPLACE FUNCTION uuid_generate_v1 RETURNS string SONAME 'libudf_uuid.so';
409+
CREATE OR REPLACE FUNCTION uuid_generate_v1mc RETURNS string SONAME 'libudf_uuid.so';
410+
CREATE OR REPLACE FUNCTION uuid_generate_v4 RETURNS string SONAME 'libudf_uuid.so';
411+
CREATE OR REPLACE FUNCTION uuid_generate_v6 RETURNS string SONAME 'libudf_uuid.so';
412+
CREATE OR REPLACE FUNCTION uuid_generate_v7 RETURNS string SONAME 'libudf_uuid.so';
413+
CREATE OR REPLACE FUNCTION uuid_nil RETURNS string SONAME 'libudf_uuid.so';
414+
CREATE OR REPLACE FUNCTION uuid_max RETURNS string SONAME 'libudf_uuid.so';
415+
CREATE OR REPLACE FUNCTION uuid_ns_dns RETURNS string SONAME 'libudf_uuid.so';
416+
CREATE OR REPLACE FUNCTION uuid_ns_url RETURNS string SONAME 'libudf_uuid.so';
417+
CREATE OR REPLACE FUNCTION uuid_ns_oid RETURNS string SONAME 'libudf_uuid.so';
418+
CREATE OR REPLACE FUNCTION uuid_ns_x500 RETURNS string SONAME 'libudf_uuid.so';
419+
CREATE OR REPLACE FUNCTION uuid_is_valid RETURNS integer SONAME 'libudf_uuid.so';
420+
CREATE OR REPLACE FUNCTION uuid_to_bin RETURNS string SONAME 'libudf_uuid.so';
421+
CREATE OR REPLACE FUNCTION uuid_from_bin RETURNS string SONAME 'libudf_uuid.so';
276422
-- `bin_to_uuid` and `uuid_from_bin` are aliases
277-
CREATE FUNCTION bin_to_uuid RETURNS string SONAME 'libudf_uuid.so';
423+
CREATE OR REPLACE FUNCTION bin_to_uuid RETURNS string SONAME 'libudf_uuid.so';
278424
```
279425

280426
Note that Windows `.dll`s are built but have not been tested - please open an
@@ -305,14 +451,14 @@ docker build . --tag mdb-udf-suite-img
305451
# run it in the background
306452
docker run --rm -d \
307453
-e MARIADB_ROOT_PASSWORD=example \
308-
--name mdb_udf_suite \
454+
--name mdb-udf-suite \
309455
mdb-udf-suite-img
310456

311457
# Enter a SQL shell
312-
docker exec -it mdb_udf_suite mariadb -pexample
458+
docker exec -it mdb-udf-suite mariadb -pexample
313459

314460
# Stop the server when done
315-
docker stop mdb_udf_suite
461+
docker stop mdb-udf-suite
316462
```
317463

318464
The UDFs can then be loaded using the `CREATE FUNCTION` statements above.

0 commit comments

Comments
 (0)