Include boolean and numeric fields into the search index. #14

Merged 3 commits on Oct 22, 2024

README.md: 29 changes (23 additions & 6 deletions)
@@ -74,6 +74,12 @@ It will identify all metadata files with the specified `base` names (i.e., `A.js
SewerRat will skip any problematic files that cannot be indexed due to, e.g., invalid JSON, insufficient permissions.
The causes of any failures are reported in the `comments` array in the HTTP response.

On success, the metadata files in the specified directory will be incorporated into the SQLite index.
We can then [search on the contents of these files](#querying-the-index) or [fetch the contents of any file](#fetching-file-contents) in the registered directory.

### Indexing in detail

As mentioned above, SewerRat will recurse through the specified directory to find metadata files with the listed `base` names.
Subdirectories with names starting with `.` are skipped during the recursive walk, so any metadata files therein will be ignored.
This is generally a sensible choice as these directories usually do not contain any interesting (scientific) information.
If any such subdirectory is relevant, a user can force SewerRat to include it in the index by passing its path directly as `path`.
@@ -86,8 +92,21 @@ All file information (e.g., modification time, owner) is taken from the link target.
SewerRat effectively treats the symbolic link as a proxy for the target file.
If the directory contains symbolic links to other directories, these will not be recursively traversed.

On success, the metadata files in the specified directory will be incorporated into the SQLite index.
We can then [search on the contents of these files](#querying-the-index) or [fetch the contents of any file](#fetching-file-contents) in the registered directory.
Each identified metadata document is parsed as JSON and converted into tokens.
For strings, we use an adaptation of the [FTS5 Unicode61 tokenizer](https://www.sqlite.org/fts5.html#unicode61_tokenizer) to break each string into tokens,
i.e., strings are split into tokens at any character that is not a Unicode letter/number or a dash.
For numbers and booleans, the string representation of the value is tokenized.
All tokens are stored in the index, associated with the JSON object property in which each was found,
e.g., the value `"Chris"` is associated with the property `"b.c"` in the document below.

```json
{
    "a": "Aaron",
    "b": {
        "c": "Chris"
    }
}
```
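
To make this splitting rule concrete, here is a minimal standalone sketch in Go; the `splitTokens` helper is hypothetical and only approximates the real FTS5-derived tokenizer (whose case folding is imitated by plain lowercasing):

```go
package main

import (
    "fmt"
    "strings"
    "unicode"
)

// splitTokens illustrates the rule above: a token boundary is any rune that
// is not a Unicode letter/number or a dash.
func splitTokens(s string) []string {
    return strings.FieldsFunc(strings.ToLower(s), func(r rune) bool {
        return !unicode.IsLetter(r) && !unicode.IsNumber(r) && r != '-'
    })
}

func main() {
    fmt.Println(splitTokens("Non non biyori")) // [non non biyori]
    fmt.Println(splitTokens("1.5"))            // [1 5]
}
```

Note how `1.5` splits at the decimal point, which is why a query for the single token `5` can match a floating-point value.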

### Automatic updates

@@ -176,10 +195,8 @@ The nature of the search depends on the value of `type`:
- For `"text"`, SewerRat searches on the text (i.e., any string property) in the metadata file.
The search clause should contain the following additional properties:
- `text`, the search string.
We use an adaptation of the [FTS5 Unicode61 tokenizer](https://www.sqlite.org/fts5.html#unicode61_tokenizer) to process all strings in the metadata files,
i.e., strings are split into tokens at any character that is not a Unicode letter/number or a dash.
The same process is applied to the string in `text`.
All tokens in `text` must match to a token in the metadata file in order for that file to be considered a match.
The tokenization process described [above](#indexing-in-detail) is applied to this string to create tokens.
All tokens in `text` must be present in the metadata file in order for that file to be considered a match.
- (optional) `field`, the name of the metadata property to be matched.
Matches to tokens are only considered within the named property.
Properties of nested objects can be specified via `.`-delimited names, e.g., `authors.first`.
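
For instance, a text clause of the following shape (values are illustrative) would match any file whose `authors.first` property contains a token for `Chris`:

```json
{ "type": "text", "text": "Chris", "field": "authors.first" }
```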
Expand Down
database.go: 56 changes (56 additions & 0 deletions)
@@ -251,6 +251,36 @@ func tokenizeMetadata(parsed interface{}, path string, pid int64, field string,
            failures = append(failures, tokfails...)
        }

    case json.Number:
        // Just treat this as a string for simplicity. This should be fine for integers,
        // but it does result in somewhat unnecessary tokenization for floating-point
        // numbers. There's no real way around it, though, as the queries are always
        // tokenized, so you wouldn't be able to find an exact match anyway.
        tokens, err := tokenizer.Tokenize(string(v))
        if err != nil {
            return []string{ fmt.Sprintf("failed to tokenize %q in %q; %v", v, path, err) }
        }

        for _, t := range tokens {
            _, err := prepped.Token.Exec(t)
            if err != nil {
                failures = append(failures, fmt.Sprintf("failed to insert token %q from %q; %v", t, path, err))
                continue
            }

            _, err = prepped.Field.Exec(field)
            if err != nil {
                failures = append(failures, fmt.Sprintf("failed to insert field %q from %q; %v", field, path, err))
                continue
            }

            _, err = prepped.Link.Exec(pid, field, t)
            if err != nil {
                failures = append(failures, fmt.Sprintf("failed to insert link for field %q to token %q from %q; %v", field, t, path, err))
                continue
            }
        }

    case string:
        tokens, err := tokenizer.Tokenize(v)
        if err != nil {
@@ -276,6 +306,32 @@
                continue
            }
        }

    case bool:
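        // Booleans yield exactly one token, "true" or "false", so we can skip
        // the tokenizer and insert the string representation directly.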
        var t string
        if v {
            t = "true"
        } else {
            t = "false"
        }

        _, err := prepped.Token.Exec(t)
        if err != nil {
            failures = append(failures, fmt.Sprintf("failed to insert token %q from %q; %v", t, path, err))
            break
        }

        _, err = prepped.Field.Exec(field)
        if err != nil {
            failures = append(failures, fmt.Sprintf("failed to insert field %q from %q; %v", field, path, err))
            break
        }

        _, err = prepped.Link.Exec(pid, field, t)
        if err != nil {
            failures = append(failures, fmt.Sprintf("failed to insert link for field %q to token %q from %q; %v", field, t, path, err))
            break
        }
    }

    return failures
database_test.go: 36 changes (36 additions & 0 deletions)
@@ -1254,6 +1254,42 @@ func TestQueryTokens(t *testing.T) {
        }
    })

t.Run("search on numbers", func(t *testing.T) {
res, err := queryTokens(dbconn, &searchClause{ Type: "text", Text: "5", Field: "bar.cost" }, nil, 0)
if err != nil {
t.Fatalf(err.Error())
}
if !equalPathArrays(extractSortedPaths(res), []string{ "metadata.json" }, to_add) {
t.Fatalf("search results are not as expected %v", res)
}

res, err = queryTokens(dbconn, &searchClause{ Type: "text", Text: "10495" }, nil, 0)
if err != nil {
t.Fatalf(err.Error())
}
if !equalPathArrays(extractSortedPaths(res), []string{ "stuff/metadata.json" }, to_add) {
t.Fatalf("search results are not as expected %v", res)
}
})

t.Run("search on booleans", func(t *testing.T) {
res, err := queryTokens(dbconn, &searchClause{ Type: "text", Text: "false" }, nil, 0)
if err != nil {
t.Fatalf(err.Error())
}
if !equalPathArrays(extractSortedPaths(res), []string{ "whee/other.json" }, to_add) {
t.Fatalf("search results are not as expected %v", res)
}

res, err = queryTokens(dbconn, &searchClause{ Type: "text", Text: "true", Field: "category.iyashikei" }, nil, 0)
if err != nil {
t.Fatalf(err.Error())
}
if !equalPathArrays(extractSortedPaths(res), []string{ "whee/other.json" }, to_add) {
t.Fatalf("search results are not as expected %v", res)
}
})

t.Run("not (simple)", func(t *testing.T) {
res, err := queryTokens(
dbconn,
load.go: 5 changes (4 additions & 1 deletion)
@@ -6,6 +6,7 @@ import (
"fmt"
"encoding/json"
"os"
"bytes"
)

type loadedMetadata struct {
@@ -27,7 +28,9 @@ func loadMetadata(f string, info fs.FileInfo) *loadedMetadata {
    }

    var vals interface{}
    dec := json.NewDecoder(bytes.NewReader(raw))
    dec.UseNumber() // preserve numbers as strings for tokenization.
    err = dec.Decode(&vals)
    if err != nil {
        output.Failure = fmt.Errorf("failed to parse %q; %w", f, err)
        return output
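
The switch from `json.Unmarshal` to a `json.Decoder` with `UseNumber` makes numbers surface as `json.Number` (a string type) rather than `float64`, preserving their exact text for tokenization. A minimal sketch of the difference, using made-up values and with error handling elided for brevity:

```go
package main

import (
    "encoding/json"
    "fmt"
    "strings"
)

func main() {
    // Default decoding: numbers become float64.
    var plain interface{}
    json.Unmarshal([]byte(`{"n": 10495}`), &plain)
    fmt.Printf("%T\n", plain.(map[string]interface{})["n"]) // float64

    // With UseNumber: numbers become json.Number, keeping the original text.
    var preserved interface{}
    dec := json.NewDecoder(strings.NewReader(`{"n": 10495}`))
    dec.UseNumber()
    dec.Decode(&preserved)
    n := preserved.(map[string]interface{})["n"].(json.Number)
    fmt.Println(string(n)) // 10495
}
```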
load_test.go: 5 changes (3 additions & 2 deletions)
@@ -5,6 +5,7 @@ import (
"os"
"path/filepath"
"time"
"encoding/json"
"strings"
)

@@ -45,8 +46,8 @@ func TestLoadMetadata(t *testing.T) {
            t.Fatal("unexpected parsed object")
        }

        target, ok := found.(float64)
        if !ok || target != 1 {
        target, ok := found.(json.Number)
        if !ok || target != "1" {
            t.Fatal("unexpected parsed object")
        }
    })
utils_test.go: 4 changes (2 additions & 2 deletions)
@@ -55,7 +55,7 @@ func mockDirectory(path string) error {
return fmt.Errorf("failed to create the mock directory; %w", err)
}

    err = os.WriteFile(filepath.Join(path, "metadata.json"), []byte(`{ "foo": "Aaron had a little lamb", "bar": { "breed": [ "merino", "border leicester" ], "type": "lamb", "number": 1 } }`), 0600)
    err = os.WriteFile(filepath.Join(path, "metadata.json"), []byte(`{ "foo": "Aaron had a little lamb", "bar": { "breed": [ "merino", "border leicester" ], "type": "lamb", "cost": 1.5 } }`), 0600)
    if err != nil {
        return fmt.Errorf("failed to mock a metadata file; %w", err)
    }
@@ -82,7 +82,7 @@
return fmt.Errorf("failed to mock a subdirectory; %w", err)
}

    err = os.WriteFile(filepath.Join(sub2, "other.json"), []byte(`{ "favorites": [ "Yuru Camp", "Non non biyori" ] }`), 0600)
    err = os.WriteFile(filepath.Join(sub2, "other.json"), []byte(`{ "favorites": [ "Yuru Camp", "Non non biyori" ], "category": { "iyashikei": true, "nsfw": false } }`), 0600)
    if err != nil {
        return fmt.Errorf("failed to mock a metadata file; %w", err)
    }