Skip to content

Commit

Permalink
Issue codeplea#1 - update your utility
Browse files Browse the repository at this point in the history
* Updates the code to use PSR-2 formatting (https://www.php-fig.org/psr/psr-2/). This requires some changes to formatting of class and method names.
* Adds composer to the project. Currently only pulls in `phpunit` as a development requirement.
* Adds `phpunit` tests created from the `example.php` code along with testing the various exceptions that can be thrown.
* Some code polish based on static analysis.
  • Loading branch information
traack_lcruz committed Oct 1, 2018
1 parent 64d696b commit 34cb759
Show file tree
Hide file tree
Showing 10 changed files with 1,730 additions and 76 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
/vendor/
151 changes: 151 additions & 0 deletions AhoCorasick.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
<?php

/*
* ahocorasick - fast string searching in php
*
* Copyright (c) 2017-2018 Lewis Van Winkle
*
* http://CodePlea.com
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgement in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*
*/



/**
 * Aho-Corasick multi-keyword string matcher.
 *
 * Usage: call addNeedle() once per keyword, then finalize(), then search()
 * as many times as needed. Matching is byte-wise (strlen() and string offset
 * access), so reported positions are byte offsets.
 *
 * The automaton is kept in three parallel tables indexed by node id (0 is the
 * root). Transitions live in their own table on purpose: PHP casts the string
 * keys '0'..'9' to integers, so storing bookkeeping entries under integer keys
 * next to the byte transitions would collide with needles containing digits.
 */
class AhoCorasick
{
    /**
     * Goto table: node id => [byte => child node id].
     *
     * @var array
     */
    private $transitions = [[]];

    /**
     * Failure links: node id => node id to fall back to on a mismatch.
     *
     * @var int[]
     */
    private $failure = [0];

    /**
     * Keywords reported when a node is reached: node id => list of needles.
     *
     * @var array
     */
    private $outputs = [[]];

    /**
     * Whether finalize() has been called.
     *
     * @var bool
     */
    private $final = false;

    /**
     * Add a keyword to search for.
     *
     * An empty needle is ignored: it has no bytes to match and would
     * otherwise attach a keyword to the root node.
     *
     * @param string $needle
     * @throws Exception if the automaton has already been finalized
     */
    public function addNeedle(string $needle)
    {
        if ($this->final) {
            throw new Exception('Cannot add word to finalized ahocorasick.');
        }

        if ($needle === '') {
            return;
        }

        $node = 0;
        $needleLength = strlen($needle);

        for ($i = 0; $i < $needleLength; ++$i) {
            $byte = $needle[$i];

            if (!isset($this->transitions[$node][$byte])) {
                // Allocate a new node; the three tables always grow together
                // so every node id is valid in each of them.
                $this->transitions[$node][$byte] = count($this->transitions);
                $this->transitions[] = [];
                $this->failure[] = 0;
                $this->outputs[] = [];
            }

            $node = $this->transitions[$node][$byte];
        }

        $this->outputs[$node][] = $needle;
    }

    /**
     * Create the structure needed to search text for the given keywords.
     * Once you call this, you cannot add additional keywords via addNeedle().
     *
     * Calling it again is a no-op, so keywords are never merged twice.
     */
    public function finalize()
    {
        if ($this->final) {
            return;
        }

        // Breadth-first walk starting from the root's children, whose failure
        // link is the root (already 0). A head index is used instead of
        // shifting the array so that dequeuing stays O(1).
        $queue = array_values($this->transitions[0]);
        $head = 0;

        while ($head < count($queue)) {
            $parent = $queue[$head++];

            foreach ($this->transitions[$parent] as $byte => $child) {
                // Follow the parent's failure chain until a node that can
                // consume $byte is found, or the root is reached.
                $fallback = $this->failure[$parent];
                while ($fallback > 0 && !isset($this->transitions[$fallback][$byte])) {
                    $fallback = $this->failure[$fallback];
                }

                $target = $this->transitions[$fallback][$byte] ?? 0;
                $this->failure[$child] = $target;

                // $target is shallower than $child, so its keyword list is
                // already complete; inherit it after the node's own keywords.
                if ($this->outputs[$target] !== []) {
                    $this->outputs[$child] = array_merge($this->outputs[$child], $this->outputs[$target]);
                }

                $queue[] = $child;
            }
        }

        $this->final = true;
    }

    /**
     * Search the text for the given keywords.
     *
     * @param string $haystack
     * @return array list of [needle, byte offset of the match start] pairs,
     *               ordered by the position at which each match ends
     * @throws Exception if finalize() has not been called yet
     */
    public function search(string $haystack): array
    {
        if (!$this->final) {
            throw new Exception('Must call finalize() before search.');
        }

        // Local copies avoid repeated property lookups in the hot loop; they
        // are only read, so PHP's copy-on-write never duplicates the tables.
        $transitions = $this->transitions;
        $failure = $this->failure;
        $outputs = $this->outputs;

        $found = [];
        $node = 0;
        $haystackLength = strlen($haystack);

        for ($i = 0; $i < $haystackLength; ++$i) {
            $byte = $haystack[$i];

            // On a mismatch, fall back until some node accepts the byte or
            // the root is reached.
            while ($node !== 0 && !isset($transitions[$node][$byte])) {
                $node = $failure[$node];
            }

            $node = $transitions[$node][$byte] ?? 0;

            foreach ($outputs[$node] as $needle) {
                $found[] = [$needle, $i - strlen($needle) + 1];
            }
        }

        return $found;
    }
}
21 changes: 14 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,13 @@ search text.
Create, add keywords, and `finalize()`:

```php
require('ahocorasick.php');
require('AhoCorasick.php');

$ac = new ahocorasick();
$ac = new AhoCorasick();

$ac->add_needle('art');
$ac->add_needle('cart');
$ac->add_needle('ted');
$ac->addNeedle('art');
$ac->addNeedle('cart');
$ac->addNeedle('ted');

$ac->finalize();

Expand Down Expand Up @@ -97,10 +97,17 @@ time: 0.054709911346436
```

Note: the regex solutions are actually slightly broken. They won't work if you
**Note:** the regex solutions are actually slightly broken. They won't work if you
have a keyword that is a prefix or suffix of another. But hey, who really uses
regex when it's not slightly broken?

Also keep in mind that building the search tree (the `add_needle()` and
Also keep in mind that building the search tree (the `addNeedle()` and
`finalize()` calls) takes time. So you'll get the best speed-up if you're
reusing the same keywords and calling `search()` many times.

# Running tests

```sh
$ composer install
$ ./vendor/bin/phpunit
```
103 changes: 44 additions & 59 deletions benchmark.php
Original file line number Diff line number Diff line change
Expand Up @@ -4,57 +4,48 @@
/* It compares our ahocorasick method with regex and strpos. */


require('ahocorasick.php');
require('benchmark_setup.php'); /* keywords and text */
require 'AhoCorasick.php';
require 'benchmark_setup.php'; /* keywords and text */

$loops = 10;

print("Loaded " . count($needles) . " keywords to search on a text of " .
strlen($haystack) . " characters.\n");
print('Loaded ' . count($needles) . ' keywords to search on a text of ' .
strlen($haystack) . " characters.\n");

print("\nSearching with strpos...\n");

$st = microtime(1);
for ($loop = 0; $loop < $loops; ++$loop) {
$found = array();
foreach($needles as $n) {
$k = 0;
while(($k = strpos($haystack, $n, $k)) !== FALSE) {
$found[] = array($n, $k);
++$k;
$found = array();
foreach ($needles as $n) {
$k = 0;
while (($k = strpos($haystack, $n, $k)) !== false) {
$found[] = array($n, $k);
++$k;
}
}
}
}
$et = microtime(1);
print("time: " . ($et - $st) . "\n");
print('time: ' . ($et - $st) . "\n");
$found_strpos = $found;






print("\nSearching with preg_match...\n");
//Note, this actually sucks and misses cases where one needle is a prefix or
//suffix of another.
$regex = '/' . implode('|', $needles) . '/';

$st = microtime(1);
for ($loop = 0; $loop < $loops; ++$loop) {
$found = array();
$k = 0;
while(preg_match($regex, $haystack, $m, PREG_OFFSET_CAPTURE, $k)) {
$found[] = $m[0];
$k = $m[0][1] + 1;
}
$found = array();
$k = 0;
while (preg_match($regex, $haystack, $m, PREG_OFFSET_CAPTURE, $k)) {
$found[] = $m[0];
$k = $m[0][1] + 1;
}
}
$et = microtime(1);
print("time: " . ($et - $st) . "\n");
//print_r($found);




print('time: ' . ($et - $st) . "\n");


print("\nSearching with preg_match_all...\n");
Expand All @@ -64,57 +55,51 @@

$st = microtime(1);
for ($loop = 0; $loop < $loops; ++$loop) {
$found = array();
$k = 0;
preg_match_all($regex, $haystack, $found, PREG_OFFSET_CAPTURE);
$found = $found[0];
$found = array();
$k = 0;
preg_match_all($regex, $haystack, $found, PREG_OFFSET_CAPTURE);
$found = $found[0];
}
$et = microtime(1);
print("time: " . ($et - $st) . "\n");



print('time: ' . ($et - $st) . "\n");


print("\nSearching with aho corasick...\n");
$ac = new ahocorasick();
foreach ($needles as $n) $ac->add_needle($n);
$ac = new AhoCorasick();
foreach ($needles as $n) {
$ac->addNeedle($n);
}
$ac->finalize();

$st = microtime(1);
for ($loop = 0; $loop < $loops; ++$loop) {
$found = array();
$found = $ac->search($haystack);
$found = $ac->search($haystack);
}
$et = microtime(1);
print("time: " . ($et - $st) . "\n");




print('time: ' . ($et - $st) . "\n");


//Check that the answers match.
//First sort the arrays.
$comp = function($a, $b) {return ($a[1] === $b[1]) ? ($a[0] > $b[0]) : ($a[1] > $b[1]);};
// Comparator for [keyword, offset] pairs: order by offset, then by keyword.
// usort() expects an integer (<0, 0, >0); returning a bool would report
// "less than" as "equal" and leave the order unreliable.
$comp = function ($a, $b) {
    return ($a[1] <=> $b[1]) ?: ($a[0] <=> $b[0]);
};
usort($found, $comp);
usort($found_strpos, $comp);

if ($found_strpos !== $found) {
print("ERROR - Aho Corasick got the wrong result.\n");
print("ERROR - Aho Corasick got the wrong result.\n");

print('strpos size: ' . count($found_strpos) . "\n");
print('aho corasick size: ' . count($found) . "\n");

print("strpos size: " . count($found_strpos) . "\n");
print("aho corasick size: " . count($found) . "\n");
$numberFound = count($found);

for ($i = 0; $i < count($found); ++$i) {
if ($found_strpos[$i] !== $found[$i]) {
print("Mismatch $i\n");
print_r($found_strpos[$i]);
print_r($found[$i]);
for ($i = 0; $i < $numberFound; ++$i) {
if ($found_strpos[$i] !== $found[$i]) {
print("Mismatch $i\n");
print_r($found_strpos[$i]);
print_r($found[$i]);
}
}
}
}




4 changes: 2 additions & 2 deletions benchmark_setup.php
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
<?php

$needles = array('abandonment', 'abashed', 'abashments', 'abduction',
$needles = ['abandonment', 'abashed', 'abashments', 'abduction',
'aberrant', 'abiding', 'abidingly', 'abjures', 'ablution', 'abolishes',
'abominably', 'aborted', 'abrasion', 'abridgment', 'abscesses', 'absconds',
'absences', 'absinthe', 'absolves', 'absorbingly', 'abundant', 'abused',
Expand Down Expand Up @@ -532,7 +532,7 @@
'remonstrates', 'remorse', 'removes', 'remunerated', 'rendering',
'renditions', 'reneged', 'renominate', 'renovators', 'reorders',
'repatriates', 'repave', 'repaying', 'repeatedly', 'repertoires', 'replied',
'reprehend', 'reprieves', 'reprimanded');
'reprehend', 'reprieves', 'reprimanded'];

$haystack = 'unscathed grampus antinuclear avenged waste oversee doggies spumes
senators balk gooseberries grilles respelled ceramists outlaid maladroitly
Expand Down
15 changes: 15 additions & 0 deletions composer.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
{
"name": "codeplea/ahocorasickphp",
"description": "Aho-Corasick multi-keyword string searching library in PHP.",
"type": "library",
"config": {
"sort-packages": true
},
"require": {
"php": ">=7.0"
},
"require-dev": {
"phpunit/phpunit": "^7.3"
},
"license": "zlib"
}
Loading

0 comments on commit 34cb759

Please sign in to comment.