kaitai-io · KOLANICH · Dec 9, 2021
diff --git a/LICENSES/Unlicense.md b/LICENSES/Unlicense.md
@@ -0,0 +1,27 @@
+Unlicense (Public Domain)
+============================
+
+This is free and unencumbered software released into the public domain.
+
+Anyone is free to copy, modify, publish, use, compile, sell, or
+distribute this software, either in source code form or as a compiled
+binary, for any purpose, commercial or non-commercial, and by any
+means.
+
+In jurisdictions that recognize copyright laws, the author or authors
+of this software dedicate any and all copyright interest in the
+software to the public domain. We make this dedication for the benefit
+of the public at large and to the detriment of our heirs and
+successors. We intend this dedication to be an overt act of
+relinquishment in perpetuity of all present and future rights to this
+software under copyright law.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+OTHER DEALINGS IN THE SOFTWARE.
+
+For more information, please refer to &lt;<https://unlicense.org/>&gt;
diff --git a/executable/intel_hyperscan/ReadMe.md b/executable/intel_hyperscan/ReadMe.md
@@ -0,0 +1,28 @@
+<!--
+SPDX-FileCopyrightText: KOLANICH, 2021
+SPDX-License-Identifier: Unlicense
+-->
+
+## Intel Hyperscan
+
+Hyperscan is a library for fast matching of regular expressions against binary buffers/streams at large scale.
+
+It serializes precompiled regexps into its own binary format. [1](https://github.com/intel/hs/blob/64a995bf445d86b74eb0f375624ffc85682eadfe/src/db.c#L62-L110) [2](https://github.com/intel/hs/blob/64a995bf445d86b74eb0f375624ffc85682eadfe/doc/dev-reference/serialization.rst).
+
+
+In this dir I have created a demo app extracting HDD model names from text streams and detecting their vendors/brands.
+
+The regexps have been taken from https://github.com/KOLANICH-ML/HDDModelDecoder.py .
+
+The app first generates a "DB", then matches it against the buffer and displays the results for self-check, then generates the serialized representations of "DB"s and stores them into files.
+
+In this dir only the "simple" format is present. [Chimera format](https://github.com/intel/hs/blob/64a995bf445d86b74eb0f375624ffc85682eadfe/chimera/ch_db.h) lives in another dir.
+
+```
+Version: 5.4.0 Features: AVX2
+vectored: Mode: VECTORED
+block: Mode: BLOCK
+stream_large: Mode: STREAM
+```
+
+Source: own work.
diff --git a/executable/intel_hyperscan/block.hyperscan_simple b/executable/intel_hyperscan/block.hyperscan_simple
diff --git a/executable/intel_hyperscan/block.hyperscan_simple.license b/executable/intel_hyperscan/block.hyperscan_simple.license
@@ -0,0 +1,2 @@
+SPDX-FileCopyrightText: 2021 KOLANICH
+SPDX-License-Identifier: Unlicense
diff --git a/executable/intel_hyperscan/generate.py b/executable/intel_hyperscan/generate.py
@@ -0,0 +1,63 @@
+#!/usr/bin/env python3
+
+import typing
+from pathlib import Path
+from pprint import pprint
+
+import hyperscan as hs
+from hyperscan import Database
+
+__license__ = "Unlicense"
+__copyright__ = "KOLANICH, 2021"
+
+thisDir = Path(__file__).parent
+
+# Regular expressions have been taken from https://github.com/KOLANICH-ML/HDDModelDecoder.py
+# Keys are the vendor labels that testDb reports for each match; values are the byte-string patterns compiled by prepareDatabase.
+rxs = {
+	"HGST": b"([HW])([UDTECMS])([HSCEATNP])(\\d{2}|5C)(\\d{2})(\\d{2})([PDVKA])([L795S])(16|18|36|38|F2|F4|AT|SA|A3|A6|E6|N6|SS|42|52|S6)([0-486M0L])([0-5])",
+	"Samsung": b"(HD|HE|HM|HN-M|HS|SP)(\\d{2,3})(HI|HJ|GJ|HX|IX|JX|JI|HA|GA|GB|GI|HB|THB|HC|II|IJ|JB|TJB|JJ|JQ|UJQ|LD|LI|LJ|MBB|RHF|RJF|SI|SJ|UI|UJ|VHF|VJF|WI|\\d[NSC])",
+	"WD": b"(WD)(\\dN|\\d{3}M|\\d{2,})([ABCDEFGHJKLMNPSTX][94BDKLRSWYAFZ0123CEJGHMPUV])?([26ABCDEFGHJKLMRSVWPTYZ1U])([RABCDEFGKSTYVWXZ])"
+}
+
+# Each key doubles as the stem of the file main writes ("<key>.hyperscan_simple"); each value is the mode passed to prepareDatabase.
+modes = {"vectored": hs.HS_MODE_VECTORED, "block": hs.HS_MODE_BLOCK, "stream_large": hs.HS_MODE_STREAM | hs.HS_MODE_SOM_HORIZON_LARGE}
+
+
+def prepareDatabase(rxs: typing.Dict[str, bytes], mode: int) -> (Database, typing.List[str]):
+	flags = hs.HS_FLAG_SOM_LEFTMOST | hs.HS_FLAG_ALLOWEMPTY | hs.HS_FLAG_DOTALL | hs.HS_FLAG_MULTILINE
+	xs = list(rxs.values())
+	ks = list(rxs.keys())
+	fgs = [flags] * len(ks)
+
+	db = hs.Database(mode=mode)
+	db.compile(expressions=xs, ids=list(range(len(ks))), flags=fgs)
+	return db, ks
+
+
+def testDb(db: Database, keys: typing.List[str], target: bytes) -> typing.Dict[str, str]:
+	res = {}
+
+	def matchesHandler(iD, start, stop, flags, ctx):
+		s = slice(start, stop)
+		model = target[s].decode("utf-8")
+		vendor = keys[iD]
+		res[model] = vendor
+
+	db.scan(target, matchesHandler)
+	return res
+
+
+def main() -> None:
+	db, keys = prepareDatabase(rxs, modes["block"])
+	inputStr = b"dcsgdfw HDN724040ALE640 HDN724040ALE640 SP1614N fafafsfa WD2500AVJS vkjsbvhfjs"
+	pprint(testDb(db, keys, inputStr))
+
+	for n, m in modes.items():
+		fn = thisDir / (n + ".hyperscan_simple")
+		db, keys = prepareDatabase(rxs, m)
+		data = hs.dumpb(db)
+		print(n, "(", hex(len(data)), ")", ":", db.info().decode("utf8"))
+		fn.write_bytes(data)
+
+
+# Generate the files only when executed as a script, not when imported.
+if __name__ == "__main__":
+	main()
diff --git a/executable/intel_hyperscan/stream_large.hyperscan_simple b/executable/intel_hyperscan/stream_large.hyperscan_simple
diff --git a/executable/intel_hyperscan/stream_large.hyperscan_simple.license b/executable/intel_hyperscan/stream_large.hyperscan_simple.license
@@ -0,0 +1,2 @@
+SPDX-FileCopyrightText: 2021 KOLANICH
+SPDX-License-Identifier: Unlicense
diff --git a/executable/intel_hyperscan/vectored.hyperscan_simple b/executable/intel_hyperscan/vectored.hyperscan_simple
diff --git a/executable/intel_hyperscan/vectored.hyperscan_simple.license b/executable/intel_hyperscan/vectored.hyperscan_simple.license
@@ -0,0 +1,2 @@
+SPDX-FileCopyrightText: 2021 KOLANICH
+SPDX-License-Identifier: Unlicense
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		SPDX-FileCopyrightText: 2021 KOLANICH
		SPDX-License-Identifier: Unlicense