Skip to content

Commit 9546c6c

Browse files
author
Gal Ben David
committed
Rust rewrite, replaced msufsort with libsais, many bug fixes and performance improvements
1 parent 63830bc commit 9546c6c

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

43 files changed

+8564
-15740
lines changed

.github/workflows/build.yml

Lines changed: 35 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,49 @@
11
name: Build
22
on: [push, pull_request]
3-
43
jobs:
5-
build:
4+
lint:
65
if: github.event_name == 'push' && !startsWith(github.event.ref, 'refs/tags')
7-
runs-on: ubuntu-20.04
6+
runs-on: ubuntu-latest
7+
steps:
8+
- name: Checkout
9+
uses: actions/checkout@v2
10+
- name: Install latest rust
11+
uses: actions-rs/toolchain@v1
12+
with:
13+
toolchain: stable
14+
override: true
15+
components: clippy
16+
- name: Lint with clippy
17+
uses: actions-rs/cargo@v1
18+
with:
19+
command: clippy
20+
args: --all-targets --all-features
21+
test:
22+
runs-on: ${{ matrix.os }}
23+
needs: lint
824
strategy:
25+
fail-fast: false
926
matrix:
10-
python-version: [3.6, 3.7, 3.8, 3.9, pypy3]
27+
python-version: ['3.7', '3.8', '3.9', '3.10']
28+
os: [ubuntu-latest , macos-latest, windows-latest]
1129
steps:
1230
- name: Checkout
1331
uses: actions/checkout@v2
1432
- name: Set up Python ${{ matrix.python-version }}
1533
uses: actions/setup-python@v2
1634
with:
1735
python-version: ${{ matrix.python-version }}
18-
- name: Install Ubuntu packages
19-
run: >-
20-
sudo apt install libidn2-dev;
36+
- name: Run image
37+
uses: abatilo/actions-poetry@v2.0.0
38+
- name: Install Rust
39+
uses: actions-rs/toolchain@v1
40+
with:
41+
profile: minimal
42+
toolchain: stable
43+
override: true
44+
- name: Install dependencies
45+
run: poetry install
46+
- name: Build Python package
47+
run: poetry run maturin develop
2148
- name: Test
22-
run: >-
23-
python setup.py test
49+
run: poetry run pytest -Werror tests

.github/workflows/deploy.yml

Lines changed: 32 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,43 @@
11
name: Deploy
22
on:
33
release:
4-
types: [published]
5-
4+
types: [released]
65
jobs:
76
deploy:
8-
runs-on: ubuntu-20.04
7+
runs-on: ${{ matrix.os }}
8+
strategy:
9+
fail-fast: false
10+
matrix:
11+
python-version: ['3.7', '3.8', '3.9', '3.10']
12+
os: [ubuntu-latest, macos-latest, windows-latest]
913
steps:
1014
- name: Checkout
1115
uses: actions/checkout@v2
12-
- name: Set up Python 3.8
16+
- name: Set up Python ${{ matrix.python-version }}
1317
uses: actions/setup-python@v2
1418
with:
15-
python-version: 3.8
16-
- name: Build a source tarball
17-
run: >-
18-
python -m pip install --user --upgrade setuptools;
19-
python setup.py sdist;
20-
- name: Publish distribution 📦 to PyPI
21-
uses: pypa/gh-action-pypi-publish@master
19+
python-version: ${{ matrix.python-version }}
20+
- name: Install Rust
21+
uses: actions-rs/toolchain@v1
2222
with:
23-
password: ${{ secrets.pypi_password }}
23+
profile: minimal
24+
toolchain: stable
25+
override: true
26+
- uses: messense/maturin-action@v1
27+
if: runner.os != 'Windows'
28+
with:
29+
maturin-version: latest
30+
command: publish
31+
manylinux: 2_24
32+
args: --username __token__ --no-sdist --interpreter python${{ matrix.python-version }}
33+
env:
34+
MATURIN_PASSWORD: ${{ secrets.pypi_password }}
35+
- uses: messense/maturin-action@v1
36+
if: runner.os == 'Windows'
37+
with:
38+
maturin-version: latest
39+
command: publish
40+
manylinux: 2_24
41+
args: --username __token__ --no-sdist --interpreter python
42+
env:
43+
MATURIN_PASSWORD: ${{ secrets.pypi_password }}

.gitignore

Lines changed: 24 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@ parts/
2020
sdist/
2121
var/
2222
wheels/
23-
pip-wheel-metadata/
2423
share/python-wheels/
2524
*.egg-info/
2625
.installed.cfg
@@ -50,6 +49,7 @@ coverage.xml
5049
*.py,cover
5150
.hypothesis/
5251
.pytest_cache/
52+
cover/
5353

5454
# Translations
5555
*.mo
@@ -72,6 +72,7 @@ instance/
7272
docs/_build/
7373

7474
# PyBuilder
75+
.pybuilder/
7576
target/
7677

7778
# Jupyter Notebook
@@ -82,7 +83,9 @@ profile_default/
8283
ipython_config.py
8384

8485
# pyenv
85-
.python-version
86+
# For a library or package, you might want to ignore these files since the code is
87+
# intended to run in multiple environments; otherwise, check them in:
88+
# .python-version
8689

8790
# pipenv
8891
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
@@ -128,6 +131,22 @@ dmypy.json
128131
# Pyre type checker
129132
.pyre/
130133

131-
*.cppimporthash
132-
.rendered.*
133-
.vscode
134+
# pytype static type analyzer
135+
.pytype/
136+
137+
# Cython debug symbols
138+
cython_debug/
139+
.gitignore
140+
.gitignore
141+
142+
# Generated by Cargo
143+
# will have compiled files and executables
144+
debug/
145+
target/
146+
147+
# Remove Cargo.lock from gitignore if creating an executable, leave it for libraries
148+
# More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html
149+
Cargo.lock
150+
151+
# These are backup files generated by rustfmt
152+
**/*.rs.bk

Cargo.toml

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
[package]
2+
name = "pysubstringsearch"
3+
version = "0.5.0"
4+
authors = ["Gal Ben David <gal@intsights.com>"]
5+
edition = "2021"
6+
description = "A Python library written in Rust that searches for substrings quickly using a Suffix Array"
7+
readme = "README.md"
8+
repository = "https://github.com/intsights/pysubstringsearch"
9+
homepage = "https://github.com/intsights/pysubstringsearch"
10+
license = "MIT"
11+
keywords = [
12+
"substring",
13+
"pattern",
14+
"search",
15+
"suffix",
16+
"array",
17+
"rust",
18+
"pyo3"
19+
]
20+
21+
[package.metadata.maturin]
22+
requires-python = ">=3.6"
23+
classifier = [
24+
"License :: OSI Approved :: MIT License",
25+
"Operating System :: MacOS",
26+
"Operating System :: Microsoft",
27+
"Operating System :: POSIX :: Linux",
28+
"Programming Language :: Python :: 3.7",
29+
"Programming Language :: Python :: 3.8",
30+
"Programming Language :: Python :: 3.9",
31+
"Programming Language :: Python :: 3.10",
32+
"Programming Language :: Rust",
33+
]
34+
35+
[lib]
36+
name = "pysubstringsearch"
37+
crate-type = ["cdylib"]
38+
39+
[dependencies]
40+
ahash = "0.7"
41+
bstr = "0.2"
42+
byteorder = "1"
43+
memchr = "2"
44+
parking_lot = "0.11"
45+
rayon = "1"
46+
47+
[dependencies.pyo3]
48+
version = "0.15.1"
49+
features = ["extension-module"]
50+
51+
[build-dependencies]
52+
cc = { version = "1.0", features = ["parallel"] }
53+
54+
[profile.release]
55+
lto = true
56+
panic = "abort"

LICENSE

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
MIT License
22

3-
Copyright (c) 2019 Gal Ben David
3+
Copyright (c) 2021 Gal Ben David
44

55
Permission is hereby granted, free of charge, to any person obtaining a copy
66
of this software and associated documentation files (the "Software"), to deal

MANIFEST.in

Lines changed: 0 additions & 4 deletions
This file was deleted.

README.md

Lines changed: 19 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,12 @@
33
<img src="https://raw.githubusercontent.com/Intsights/PySubstringSearch/master/images/logo.png" alt="Logo">
44
</a>
55
<h3 align="center">
6-
Python library for fast substring/pattern search written in C++ leveraging Suffix Array Algorithm
6+
A Python library written in Rust that searches for substrings quickly using a Suffix Array
77
</h3>
88
</p>
99

1010
![license](https://img.shields.io/badge/MIT-License-blue)
11-
![Python](https://img.shields.io/badge/Python-3.6%20%7C%203.7%20%7C%203.8%20%7C%203.9%20%7C%20pypy3-blue)
11+
![Python](https://img.shields.io/badge/Python-3.7%20%7C%203.8%20%7C%203.9%20%7C%203.10-blue)
1212
![Build](https://github.com/Intsights/PySubstringSearch/workflows/Build/badge.svg)
1313
[![PyPi](https://img.shields.io/pypi/v/PySubstringSearch.svg)](https://pypi.org/project/PySubstringSearch/)
1414

@@ -19,8 +19,7 @@
1919
- [Built With](#built-with)
2020
- [Performance](#performance)
2121
- [500MB File](#500mb-file)
22-
- [6000MB File](#6000mb-file)
23-
- [Prerequisites](#prerequisites)
22+
- [7500MB File](#7500mb-file)
2423
- [Installation](#installation)
2524
- [Usage](#usage)
2625
- [License](#license)
@@ -29,48 +28,35 @@
2928

3029
## About The Project
3130

32-
PySubstringSearch is a library intended for searching over an index file for substring patterns. The library is written in C++ to achieve speed and efficiency. The library also uses [Msufsort](https://github.com/michaelmaniscalco/msufsort) suffix array construction library for string indexing. The created index consists of the original text and a 32bit suffix array structs. The library relies on a proprietary container protocol to hold the original text along with the index in chunks of 512mb to evade the limitation of the Suffix Array Construction implementation.
31+
PySubstringSearch is a library designed to search over an index file for substring patterns. In order to achieve speed and efficiency, the library is written in Rust. For string indexing, the library uses the [libsais](https://github.com/IlyaGrebnov/libsais) suffix array construction library. The index created consists of the original text and a 32-bit suffix array struct. To get around the limitations of the suffix array construction implementation, the library uses a proprietary container protocol to hold the original text and index in chunks of 512MB.
3332

34-
The module implements multiple methods.
35-
- `search` - search concurrently for a substring existed in different entries within the index file. As the index file getting bigger with multiple inner chunks, the concurrency effect increases.
36-
- `count_entries` - return the number of entries in the index file consisting of the substring.
37-
- `count_occurrences` - return the number of occurrences of the substring in all the entries. If the substring exists multiple times in the same entry, each occurrence will be counted.
33+
The module implements a method for searching.
34+
- `search` - Find all entries in the index file that contain a given substring. The inner chunks of the index are searched concurrently, so the benefit of concurrency increases as the index file grows and is split into more chunks.
3835

3936

4037
### Built With
4138

42-
* [Msufsort](https://github.com/michaelmaniscalco/msufsort)
39+
* [libsais](https://github.com/IlyaGrebnov/libsais)
4340

4441

4542
### Performance
4643

4744
#### 500MB File
4845
| Library | Function | Time | #Results | Improvement Factor |
4946
| ------------- | ------------- | ------------- | ------------- | ------------- |
50-
| [ripgrepy](https://pypi.org/project/ripgrepy/) | Ripgrepy('text_one', '500mb').run().as_string.split('\n') | 148ms | 2367 | 1.0x |
51-
| [PySubstringSearch](https://github.com/Intsights/PySubstringSearch) | reader.search('text_one') | 1.28ms | 2367 | 115.6x |
52-
| [ripgrepy](https://pypi.org/project/ripgrepy/) | Ripgrepy('text_two', '500mb').run().as_string.split('\n') | 116ms | 159 | 1.0x |
53-
| [PySubstringSearch](https://github.com/Intsights/PySubstringSearch) | reader.search('text_two') | 228µs | 159 | 508.7x |
47+
| [ripgrepy](https://pypi.org/project/ripgrepy/) | Ripgrepy('google', '500mb').run().as_string.split('\n') | 47.2ms | 5943 | 1.0x |
48+
| [PySubstringSearch](https://github.com/Intsights/PySubstringSearch) | reader.search('google') | 497µs | 5943 | 95x |
49+
| [ripgrepy](https://pypi.org/project/ripgrepy/) | Ripgrepy('text_two', '500mb').run().as_string.split('\n') | 44.7ms | 159 | 1.0x |
50+
| [PySubstringSearch](https://github.com/Intsights/PySubstringSearch) | reader.search('text_two') | 14.9µs | 159 | 3000x |
5451

55-
#### 6000MB File
52+
#### 7500MB File
5653
| Library | Function | Time | #Results | Improvement Factor |
5754
| ------------- | ------------- | ------------- | ------------- | ------------- |
58-
| [ripgrepy](https://pypi.org/project/ripgrepy/) | Ripgrepy('text_one', '6000mb').run().as_string.split('\n') | 2.4s | 59538 | 1.0x |
59-
| [PySubstringSearch](https://github.com/Intsights/PySubstringSearch) | reader.search('text_one') | 15.4ms | 59538 | 155.8x |
60-
| [ripgrepy](https://pypi.org/project/ripgrepy/) | Ripgrepy('text_two', '6000mb').run().as_string.split('\n') | 1.5s | 7266 | 1.0x |
61-
| [PySubstringSearch](https://github.com/Intsights/PySubstringSearch) | reader.search('text_two') | 1.97ms | 7266 | 761.4x |
55+
| [ripgrepy](https://pypi.org/project/ripgrepy/) | Ripgrepy('google', '7500mb').run().as_string.split('\n') | 900ms | 62834 | 1.0x |
56+
| [PySubstringSearch](https://github.com/Intsights/PySubstringSearch) | reader.search('google') | 10.1ms | 62834 | 89.1x |
57+
| [ripgrepy](https://pypi.org/project/ripgrepy/) | Ripgrepy('text_two', '7500mb').run().as_string.split('\n') | 820ms | 0 | 1.0x |
58+
| [PySubstringSearch](https://github.com/Intsights/PySubstringSearch) | reader.search('text_two') | 200µs | 0 | 4100x |
6259

63-
### Prerequisites
64-
65-
In order to compile this package you should have GCC & Python development package installed.
66-
* Fedora
67-
```sh
68-
sudo dnf install python3-devel gcc-c++
69-
```
70-
* Ubuntu 18.04
71-
```sh
72-
sudo apt install python3-dev g++-9
73-
```
7460

7561
### Installation
7662

@@ -79,7 +65,6 @@ pip3 install PySubstringSearch
7965
```
8066

8167

82-
8368
## Usage
8469

8570
Create an index
@@ -97,6 +82,9 @@ writer.add_entry('some short string')
9782
writer.add_entry('another but now a longer string')
9883
writer.add_entry('more text to add')
9984

85+
# adding entries from file lines
86+
writer.add_entries_from_file_lines('input_file.txt')
87+
10088
# making sure the data is dumped to the file
10189
writer.finalize()
10290
```
@@ -117,16 +105,6 @@ reader.search('short')
117105
# lookup for a substring
118106
reader.search('string')
119107
>>> ['some short string', 'another but now a longer string']
120-
121-
# count the number of occurrences
122-
# ['some short string', 'another string now, but a longer string']
123-
reader.count_occurences('string')
124-
>>> 3
125-
126-
# count the number of entries
127-
# ['some short string', 'another string now, but a longer string']
128-
reader.count_occurences('string')
129-
>>> 2
130108
```
131109

132110

build.rs

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
fn main() {
2+
println!("cargo:rerun-if-changed=libsais.c");
3+
4+
let src = [
5+
"src/libsais/libsais.c",
6+
];
7+
let mut builder = cc::Build::new();
8+
let build = builder
9+
.files(src.iter());
10+
build.compile("libsais");
11+
}

0 commit comments

Comments
 (0)