From fa68328058373c296053dea7bc5d57f7a3ce4564 Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Mon, 21 Jul 2025 14:15:45 +0200 Subject: [PATCH 01/47] search class --- activity_browser/bwutils/search/__init__.py | 1 + .../bwutils/search/searchengine.py | 627 ++++++++++++++++++ 2 files changed, 628 insertions(+) create mode 100644 activity_browser/bwutils/search/__init__.py create mode 100644 activity_browser/bwutils/search/searchengine.py diff --git a/activity_browser/bwutils/search/__init__.py b/activity_browser/bwutils/search/__init__.py new file mode 100644 index 000000000..f9fde759c --- /dev/null +++ b/activity_browser/bwutils/search/__init__.py @@ -0,0 +1 @@ +from searchengine import SearchEngine \ No newline at end of file diff --git a/activity_browser/bwutils/search/searchengine.py b/activity_browser/bwutils/search/searchengine.py new file mode 100644 index 000000000..84a9c6333 --- /dev/null +++ b/activity_browser/bwutils/search/searchengine.py @@ -0,0 +1,627 @@ +from itertools import permutations, chain +import itertools +import functools +from collections import Counter, OrderedDict, Iterable +import pandas as pd +import numpy as np +import re + + +class SearchEngine: + """ + A Search Engine class, takes a dataframe and makes it searchable. + + A search requires a string, and will return a list of unique identifiers in the dataframe. + There are three options for search: + SearchEngine.literal_search(): searches for exact matches of the search query + SearchEngine.fuzzy_search(): searches for approximate matches of search query, sorted by relevance + SearchEngine.search(): combines both of the above, literal matches are returned first, next all fuzzy results, buth subsets sorted by relevance + It is recommended to always use searchEngine.search(), but the other options are there. + + Initialization takes: + df: Dataframe that needs to be searchable. + identifier_name: values in this column will be returned as search results, all values in this column need to be unique. + searchable_columns: these columns need to be searchable, if none are given, all columns will be made searchable. 
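A minimal usage sketch (the frame below and its column names are made up for illustration; only the identifier column has to exist and hold unique values):

    import pandas as pd
    # from activity_browser.bwutils.search import SearchEngine

    df = pd.DataFrame(
        [["a", "coal production", "coal"],
         ["b", "steel production", "steel"]],
        columns=["id", "name", "product"],
    )
    engine = SearchEngine(df, identifier_name="id", searchable_columns=["name", "product"])
    results = engine.search("coal production")  # sorted identifiers, best match first, e.g. ["a", "b"]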
+ + Updating data is possible as well: + add_identifier(): adds this identifier to the searchable data + remove_identifier(): removes this identifier from the searchable data + change_identifier(): changes this identifier (wrapper for remove_identifier and add_identifier) + + """ + + def __init__(self, df: pd.DataFrame, identifier_name: str, searchable_columns: list = []): + + # compile regex patterns for cleaning + self.SUB_PATTERN = re.compile(r"[,\(\)\[\]'\"]") # for replacing with empty string + self.SPACE_PATTERN = re.compile(r"[-−:;]") # for replacing with space + self.ONE_SPACE_PATTERN = re.compile(r"\s+") # for replacing multiple white space with 1 space + + self.q = 2 # character lenght of q grams + self.base_weight = 10 # base weigthing for sorting results + + assert identifier_name in df.columns # make sure identifier col exist + assert df[identifier_name].nunique() == df.shape[0] # make sure identifiers are all unique + self.identifier_name = identifier_name + + # ensure columns given actually exist + # always ensure "identifier" is present + if searchable_columns == []: + # if no list is given, assume all columns are searchable + self.columns = list(df.columns) + else: + # create subset of columns to be searchable, discard rest + self.columns = [col for col in searchable_columns if col in df.columns] + if self.identifier_name not in self.columns: # keep identifier col + self.columns.append(self.identifier_name) + df = df[self.columns] + # set the identifier column as index + df = df.set_index(self.identifier_name, drop=False) + + # convert all data to str + df = df.astype(str) + + # find the self.identifier_name column index and store as int + self.identifier_column = self.columns.index(self.identifier_name) + + # store all searchable columns except the identifier + self.regular_columns = [i for i in range(len(self.columns)) if i != self.identifier_column] + + # initialize search index dicts and update df + self.identifier_to_word = {} + self.word_to_identifier = {} + self.word_to_q_grams = {} + self.q_gram_to_word = {} + self.df = pd.DataFrame() + + self.update_index(df) + + # +++ Utility functions + + def update_index(self, update_df: pd.DataFrame) -> None: + """Update search index dicts and the df.""" + + def update_dict(update_me: dict, new: dict) -> dict: + """Update a dict of counters with new dict of counters.""" + for dict_key, _counter in new.items(): + if dict_key in update_me: + update_me[dict_key].update(_counter) + else: + update_me[dict_key] = _counter + return update_me + + # identifier to word and df + i2w, update_df = self.words_in_df(update_df) + self.identifier_to_word = update_dict(self.identifier_to_word, i2w) + self.df = pd.concat([self.df, update_df]) + + # word to identifier + w2i = self.reverse_dict_many_to_one(i2w) + self.word_to_identifier = update_dict(self.word_to_identifier, w2i) + + # word to qgram + w2q = self.list_to_q_grams(w2i.keys()) + self.word_to_q_grams = update_dict(self.word_to_q_grams, w2q) + + # gram to word + q2w = self.reverse_dict_many_to_one(w2q) + self.q_gram_to_word = update_dict(self.q_gram_to_word, q2w) + + def clean_text(self, text: str): + """Clean a string so it doesn't contain weird characters or multiple spaces etc.""" + text = self.SUB_PATTERN.sub("", text.lower()) + text = self.SPACE_PATTERN.sub(" ", text) + text = self.ONE_SPACE_PATTERN.sub(" ", text).strip() + return text + + def text_to_positional_q_gram(self, text: str) -> list: + """Return a positional list of qgrams for the given string. 
+ + https://en.wikipedia.org/wiki/N-gram + q-grams are n-grams on character level. + + qgrams of "word" would be "wo", "or" and "rd" for q=2 + + Note: these are technically positional q grams, but we don't use their + positions currently + """ + q = self.q + + # just return a single-item list if the text is equal or shorter than q + # else, generate qgrams + if len(text) <= q: + return [text] + else: + return [text[i:i + q] for i in range(len(text) - q + 1)] + + def words_in_df(self, df: pd.DataFrame = None) -> tuple[dict, pd.DataFrame]: + """Return a dict of {identifier: word} for df.""" + + df = df if any(df) else self.df + return_df = df.copy() + + df = df.iloc[:, self.regular_columns] + identifier_word_dict = {} + col = [] + + for row in df.itertuples(index=True): + line = self.clean_text(" ".join(row[1:])) + col.append(line) + identifier_word_dict[row[0]] = Counter(line.split(" ")) + + return_df["query_col"] = col + + return identifier_word_dict, return_df + + def reverse_dict_many_to_one(self, dictionary: dict) -> dict: + """Reverse a dictionary of Counter objects.""" + reverse = {} + for identifier, counter_object in dictionary.items(): + for countable, count in counter_object.items(): + if countable not in reverse: + reverse[countable] = Counter() + reverse[countable][identifier] += count + return reverse + + def list_to_q_grams(self, word_list: Iterable) -> dict: + """Convert a list of unique words to a dict with Counter objects. + + + q_gram_dict = { + "word": Counter( + "wo": 1 + "or": 1 + "rd": 1 + ) + } + + """ + q_gram_dict = {} + + for word in word_list: + q_gram_dict[word] = Counter(self.text_to_positional_q_gram(word)) + + return q_gram_dict + + # +++ Changes to searchable data + + def add_identifier(self, identifier, data: dict) -> None: + """Add this identifier to the search index. + + identifier is expected to be a unique identifier that has not been used before + data is expected to be a dict of column names and data + """ + + # make sure we don't add an identifier that already exists + assert identifier not in self.df.index.to_list() + + df_cols = self.columns + + # drop fields that are not in self.df + drop = [col for col in data if col not in df_cols] + for field in drop: + del data[field] + + # add empty field for missing data + for col in df_cols: + if col not in data: + data[col] = "" + + # convert to df + new_df = pd.DataFrame(data, index=[identifier]) + new_df = new_df.astype(str) + + # update the search index data + self.update_index(new_df) + + def remove_identifier(self, identifier) -> None: + """Remove this identifier from self.df and the search index. + """ + + # remove from df + self.df.drop(identifier, inplace=True) + + # find words that may need to be removed + words = self.identifier_to_word[identifier] + for word in words: + if len(self.word_to_identifier[word]) == 1: + # this word is only found in this identifier, + # remove the word and check for q grams + del self.word_to_identifier[word] + + q_grams = self.word_to_q_grams[word] + for q_gram in q_grams: + if len(self.q_gram_to_word[q_gram]) == 1: + # this q_gram is only used in this word, + # remove it + del self.q_gram_to_word[q_gram] + + del self.word_to_q_grams[word] + else: + # remove the identifier from the + del self.word_to_identifier[word][identifier] + # finally, remove the identifier + del self.identifier_to_word[identifier] + + def change_identifier(self, identifier, data: dict) -> None: + """Change this identifier. 
+ + identifier is expected to be a unique identifier that is in use + data is expected to be a dict of column names and data that change + + only changed data needs to be supplied + """ + assert identifier in self.df.index.to_list() + + # get existing data + update_data = dict(self.df.loc[identifier].values) + + # overwrite new data where relevant + for field, value in data.items(): + update_data[field] = value + + # remove the entry + self.remove_identifier(identifier) + + # add entry with new data + self.add_identifier(identifier, update_data) + + # +++ Search + + def filter_dataframe(self, df: pd.DataFrame, pattern: str, search_columns: list = None) -> pd.Series: + """Filter the search columns of a dataframe on a pattern. + + Returns a mask (true/false) pd.Series with matching items.""" + + search_columns = search_columns if search_columns else self.columns + + mask = functools.reduce( + np.logical_or, + [ + df[col].apply(lambda x: pattern in x.lower()) + for col in search_columns + ], + ) + return mask + + def literal_search(self, text): + """Do literal search of the text in all original columns that were given.""" + + identifiers = self.filter_dataframe(self.df, text) + df = self.df.loc[identifiers] + identifiers = df.index.to_list() + + return identifiers + + def osa_distance(self, word1: str, word2: str, cutoff: int = 0, cutoff_return: int = 1000) -> int: + """Calculate the Optimal String Alignment (OSA) edit distance between two strings, return edit distance. + + Has additional cutoff variable, if cutoff is higher than 0 and if the words have + a larger difference in length, immediately return a large number + + OSA is a restricted form of the Damerau–Levenshtein distance. + https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance#Optimal_string_alignment_distance + + The edit distance is how many operations (insert, delete, substitute or transpose a character) need to happen to convert one string to another. + insert and delete are obvious operations, but substitute and transpose are explained: + substitute: replace one character with another: e.g. word1=cat word2=cab, t->b substitution is 1 operation + transpose: swap the places of two adjacent characters with each other: e.g. word1=coal word2=cola al -> la transposition is 1 operation + + The minimum amount of operations (OSA edit distance) is returned. 
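A few concrete values for intuition, called on an initialised engine (these agree with the unit tests added later in this series):

    engine.osa_distance("coal", "coal")       # 0, identical strings
    engine.osa_distance("cat", "cab")         # 1, one substitution ('t' -> 'b')
    engine.osa_distance("coal", "cola")       # 1, one transposition ('al' -> 'la')
    engine.osa_distance("coal", "")           # 4, length of the longer string
    engine.osa_distance("coal", "chocolate")  # 6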
+ """ + + if word1 == word2: + # if the strings are the same, immediately return 0 + return 0 + + len1, len2 = len(word1), len(word2) + + if 0 < cutoff < abs(len1 - len2): + # if the length difference between 2 words is over the cutoff, + # just return instead of calculating the edit distance + return cutoff_return + + if len1 == 0 or len2 == 0: + # in case (at least) one of the strings is empty, + # return the lenth of the longest string + return max(len1, len2) + + # Initialize matrix + distance = [[0] * len2 for _ in range(len1)] + + # calculate shortest edit distance + for i in range(len1): + for j in range(len2): + cost = 0 if word1[i] == word2[j] else 1 + + # Compute distances for insertion, deletion and substitution + insertion = distance[i][j - 1] + 1 if j > 0 else i + 1 + deletion = distance[i - 1][j] + 1 if i > 0 else j + 1 + substitution = distance[i - 1][j - 1] + cost if i > 0 and j > 0 else max(i, j) + cost + + distance[i][j] = min(deletion, insertion, substitution) + + # Compute transposition when relevant + if i > 0 and j > 0 and word1[i] == word2[j - 1] and word1[i - 1] == word2[j]: + transposition = distance[i - 2][j - 2] + 1 if i > 1 and j > 1 else max(i, j) - 1 + distance[i][j] = min(distance[i][j], transposition) + + return distance[len1 - 1][len2 - 1] + + def find_q_gram_matches(self, q_grams: list) -> pd.DataFrame: + """Find which of the given q_grams exist in self.q_gram_to_word, + return a sorted dataframe of best matching words. + """ + n_q_grams = len(q_grams) + + matches = {} + + # find words that match our qgrams + for q_gram in q_grams: + if words := self.q_gram_to_word.get(q_gram, False): + # q_gram exists in our search index + for word in words: + matches[word] = matches.get(word, 0) + words[word] + + # if we find no results, return an empty dataframe + if len(matches) == 0: + return pd.DataFrame({"word": [], "matches": []}) + + # otherwise, create a dataframe and + # reduce search results to most relevant results + matches = {"word": matches.keys(), "matches": matches.values()} + matches = pd.DataFrame(matches) + max_q = max(matches["matches"]) + + # determine how many results we want to keep based on how good our results are + min_q = max(max_q * 0.5, # have at least half of qgrams of best match or... + max(n_q_grams * 0.5, # if more, at least half the qgrams in the query word? + 1)) # okay just do 1 qgram if there are no more in the word + + matches = matches[matches["matches"] >= min_q] + matches = matches.sort_values(by="matches", ascending=False) + matches = matches.reset_index(drop=True) + + return matches.iloc[:min(len(matches), 2500), :] # return at most this many results + + def spell_check(self, text: str) -> OrderedDict: + """Create an OrderedDict of each word in the text (space separated) + with as values possible alternatives. + + Alternatives are first found with q-grams, then refined with string edit distance + + We rank alternative words based on 1) edit distance 2) how often a word is used in an entry + If too many results are found, we only keep edit distance 1, + if we want more results, we keep with longer edit distance up to ... 
+ + + word_results = OrderedDict( + "word": [word, work] + ) + + """ + + word_results = OrderedDict() + + matches_goal = 3 # ideally we have at least this many alternatives + + always_accept_this = 1 # values of this or lower always accepted + never_accept_this = 4 # values this or over always rejected + + # make list of unique words + words = OrderedDict() + for word in text.split(" "): + words[word] = False + words = words.keys() + + words = [self.clean_text(word) for word in words] + + for word in words: + + # first, find possible matches quickly + q_grams = self.text_to_positional_q_gram(word) + possible_matches = self.find_q_gram_matches(q_grams) + + matches = [] + other_matches = {} + + # now, refine with levenshtein + for row in possible_matches.itertuples(): + + edit_distance = self.osa_distance(word, row[1], cutoff=never_accept_this) + + if edit_distance == 0: + continue # we are looking for alternatives only, not the exact word + elif edit_distance <= always_accept_this: + matches.append(row[1]) + elif edit_distance < never_accept_this: + if other_matches.get(edit_distance): + other_matches[edit_distance].append(row[1]) + else: + other_matches[edit_distance] = [row[1]] + + # if we have fewer matches than goal, add more 'less good' matches + if len(matches) < matches_goal: + for i in range(always_accept_this + 1, never_accept_this): + # iteratively increate worse matches + if new := other_matches.get(i): + matches = matches + new + + if len(matches) >= matches_goal: + break + + word_results[word] = matches + + return word_results + + def build_queries(self, query_text) -> list: + """Make all possible subsets of words in the query, including alternative words.""" + query_text = self.spell_check(query_text) + + # find all combinations of the query words as given + queries = list(query_text.keys()) + subsets = list(chain.from_iterable( + (itertools.combinations( + queries, r) for r in range(1, len(queries) + 1)))) + all_queries = [] + + for combination in subsets: + # add the 'default' option + all_queries.append(combination) + # now add all options with all alternatives + for i, word in enumerate(combination): + for alternative in query_text.get(word, []): + alternative_combination = list(combination) + alternative_combination[i] = alternative + all_queries.append(alternative_combination) + + return all_queries + + def weigh_identifiers(self, identifiers: Iterable, weight: int, weighted_ids: Counter) -> Counter: + """Add weights to identifier counter for these identifiers""" + + for identifier in identifiers: + weighted_ids[identifier] += int(weight) + + return weighted_ids + + def search_size_1(self, queries: list, original_words: list, orig_word_weight=11, exact_word_weight=1) -> dict: + """Return a dict of {query_word: Counter(identifier)}. 
+ + queries: is a list of len 1 tuple/lists of words that are a searched word or a 'spell checked' similar word + original words: a list of words actually searched for (not including spellechecked) + + orig_word_weight: additional weight to add to original words + exact_word_weight: additional weight to add to exact word matches (as opposed to be 'in' str) + + First, we find all matching words, creating a dict of words in 'queries' as keys and words matching that query word as list of values + Next, we convert this to identifiers and add weights: + Weight will be increased if matching 'orig_word_weight' or 'exact_word_weight' + """ + matches = {} + # add each word in search index if query_word in word + for word in self.word_to_identifier.keys(): + for query in queries: + # query is list/tuple of len 1 + query_word = query[0] # only use the word + if query_word in word: + words = matches.get(query_word, []) + words.extend([word]) + matches[query_word] = words + + # now convert matched words to matched identifiers + matched_identifiers = {} + for word, matching_words in matches.items(): + for matched_word in matching_words: + weight = self.base_weight + + id_counter = matched_identifiers.get(word, Counter()) + + # add the word n times, where n is the weight, original search word is weighted higher than alternatives + if matched_word in original_words: + weight += orig_word_weight # increase weight for original word + if matched_word == word: + weight += exact_word_weight # increase weight for exact matching word + + id_counter = self.weigh_identifiers(self.word_to_identifier[matched_word], weight, id_counter) + matched_identifiers[word] = id_counter + + return matched_identifiers + + def fuzzy_search(self, text: str) -> list: + """Search the dataframe, finding approximate matches and return a list of identifiers, + ranked by how well each identifier matches the search text. 
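For intuition, with the small test frame used later in this series, a misspelled query word is still resolved through the q-gram / edit-distance spell check, so the "coal production" rows are found anyway (exact ordering depends on the weights):

    engine.fuzzy_search("coal prodction")  # "prodction" is corrected to "production";
                                           # the "coal production" entries rank near the top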
+ """ + + queries = self.build_queries(text) + + # make list of unique original words + orig_words = OrderedDict() + for word in text.split(" "): + orig_words[word] = False + orig_words = orig_words.keys() + orig_words = [self.clean_text(word) for word in orig_words] + + # order the queries by the amount of words they contain + # we do this because longer queries (more words) are harder to find, but we have many alternatives so we search in a smaller search space + queries_by_size = OrderedDict() + longest_query = max([len(q) for q in queries]) + for query_len in range(1, longest_query + 1): + queries_by_size[query_len] = [q for q in queries if len(q) == query_len] + + # first handle queries of length 1 + query_to_identifier = self.search_size_1(queries_by_size[1], orig_words) + + # get all results into a df, we rank further later + all_identifiers = set() + for id_list in [id_list for id_list in query_to_identifier.values()]: + all_identifiers.update(id_list) + search_df = self.df.loc[list(all_identifiers)] + + # now, we search for combinations of query words and get only those identifiers + # we then reduce de search_df further for only those matching identifiers + # we then search the permutations of that set of words + for q_len, query_set in queries_by_size.items(): + if q_len == 1: + # we already did these above + continue + for query in query_set: + + # get the intersection of all identifiers + # meaning, a set of identifiers that occur in ALL sets of len(1) for the individual words in the query + # this ensures we only ever search data where ALL items occur to reduce search-space + query_identifiers = set.intersection(*[set(query_to_identifier.get(q_word)) for q_word in query if + query_to_identifier.get(q_word, False)]) + if len(query_identifiers) == 0: + # there is no match for this combination of query words, skip + break + + # we now add these identifiers to a counter for this query name, + query_name = " ".join(query) + + weight = self.base_weight * q_len + query_to_identifier[query_name] = self.weigh_identifiers(query_identifiers, weight, Counter()) + + # now search for all permutations of this query combined with a space + query_df = search_df[search_df[self.identifier_name].isin(query_identifiers)] + for query_perm in permutations(query): + mask = self.filter_dataframe(query_df, " ".join(query_perm), search_columns=["query_col"]) + new_df = query_df.loc[mask].reset_index(drop=True) + if len(new_df) == 0: + # there is no match for this permutation of words, skip + continue + new_ids = list(new_df[self.identifier_name]) + # we weigh a combination of words and next to each other even higher than just the words separately + weight = self.base_weight * q_len * q_len + query_to_identifier[query_name] = self.weigh_identifiers(new_ids, weight, + query_to_identifier[query_name]) + + # now finally, move to one object sorted list by highest score + all_identifiers = Counter() + for identifiers in query_to_identifier.values(): + all_identifiers += identifiers + + # now sort on highest weights and make list type + sorted_identifiers = [identifier[0] for identifier in all_identifiers.most_common()] + return sorted_identifiers + + def search(self, text) -> list: + """Search the dataframe on this text, return a sorted list of identifiers""" + + literal_identifiers = self.literal_search(text) + fuzzy_identifiers = self.fuzzy_search(text) + + ordered_literal_identifiers = [] + other_identifiers = [] + + # add all fuzzy identifiers to one of two lists, depending on whether they were found in 
literal search or not + # this guarantees we put the literal matches on top, but still sort within this group based on fuzzy scores + for identifier in fuzzy_identifiers: + if identifier in literal_identifiers: + ordered_literal_identifiers.append(identifier) + else: + other_identifiers.append(identifier) + + identifiers = ordered_literal_identifiers + other_identifiers + + return identifiers From 40bf40747f3e3d1b8422f5526f8662f902acf2a8 Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Tue, 22 Jul 2025 10:59:33 +0200 Subject: [PATCH 02/47] Include correct init file as well --- activity_browser/bwutils/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/activity_browser/bwutils/__init__.py b/activity_browser/bwutils/__init__.py index bac9adccd..4bb75c292 100644 --- a/activity_browser/bwutils/__init__.py +++ b/activity_browser/bwutils/__init__.py @@ -12,6 +12,7 @@ from .montecarlo import MonteCarloLCA from .multilca import MLCA, Contributions from .pedigree import PedigreeMatrix +from .search import SearchEngine from .sensitivity_analysis import GlobalSensitivityAnalysis from .superstructure import SuperstructureContributions, SuperstructureMLCA from .uncertainty import (CFUncertaintyInterface, ExchangeUncertaintyInterface, From 413ac1ae1bcc958e70628367c64debdfa6904be4 Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Wed, 23 Jul 2025 15:51:06 +0200 Subject: [PATCH 03/47] Search tests, minor corrections, better documentation --- activity_browser/bwutils/search/__init__.py | 2 +- .../bwutils/search/searchengine.py | 168 ++++++++++------ tests/test_search.py | 181 ++++++++++++++++++ 3 files changed, 289 insertions(+), 62 deletions(-) create mode 100644 tests/test_search.py diff --git a/activity_browser/bwutils/search/__init__.py b/activity_browser/bwutils/search/__init__.py index f9fde759c..042045a6b 100644 --- a/activity_browser/bwutils/search/__init__.py +++ b/activity_browser/bwutils/search/__init__.py @@ -1 +1 @@ -from searchengine import SearchEngine \ No newline at end of file +from .searchengine import SearchEngine \ No newline at end of file diff --git a/activity_browser/bwutils/search/searchengine.py b/activity_browser/bwutils/search/searchengine.py index 84a9c6333..a444e3d19 100644 --- a/activity_browser/bwutils/search/searchengine.py +++ b/activity_browser/bwutils/search/searchengine.py @@ -1,7 +1,8 @@ from itertools import permutations, chain import itertools import functools -from collections import Counter, OrderedDict, Iterable +from collections import Counter, OrderedDict +from typing import Iterable import pandas as pd import numpy as np import re @@ -15,7 +16,8 @@ class SearchEngine: There are three options for search: SearchEngine.literal_search(): searches for exact matches of the search query SearchEngine.fuzzy_search(): searches for approximate matches of search query, sorted by relevance - SearchEngine.search(): combines both of the above, literal matches are returned first, next all fuzzy results, buth subsets sorted by relevance + SearchEngine.search(): combines both of the above, literal matches are returned first, next all fuzzy results, + but subsets sorted by relevance. It is recommended to always use searchEngine.search(), but the other options are there. 
Initialization takes: @@ -37,11 +39,15 @@ def __init__(self, df: pd.DataFrame, identifier_name: str, searchable_columns: l self.SPACE_PATTERN = re.compile(r"[-−:;]") # for replacing with space self.ONE_SPACE_PATTERN = re.compile(r"\s+") # for replacing multiple white space with 1 space - self.q = 2 # character lenght of q grams - self.base_weight = 10 # base weigthing for sorting results + self.q = 2 # character length of q grams + self.base_weight = 10 # base weighting for sorting results + + if identifier_name not in df.columns: # make sure identifier col exist + raise NameError(f"Identifier column {identifier_name} not found in dataframe. Use an existing column name.") + if df[identifier_name].nunique() != df.shape[0]: # make sure identifiers are all unique + raise KeyError( + f"Identifier column {identifier_name} must only contain unique values. Found {df[identifier_name].nunique()} unique values for length {df.shape[0]}") - assert identifier_name in df.columns # make sure identifier col exist - assert df[identifier_name].nunique() == df.shape[0] # make sure identifiers are all unique self.identifier_name = identifier_name # ensure columns given actually exist @@ -99,11 +105,11 @@ def update_dict(update_me: dict, new: dict) -> dict: w2i = self.reverse_dict_many_to_one(i2w) self.word_to_identifier = update_dict(self.word_to_identifier, w2i) - # word to qgram + # word to q-gram w2q = self.list_to_q_grams(w2i.keys()) self.word_to_q_grams = update_dict(self.word_to_q_grams, w2q) - # gram to word + # q-gram to word q2w = self.reverse_dict_many_to_one(w2q) self.q_gram_to_word = update_dict(self.q_gram_to_word, q2w) @@ -115,20 +121,18 @@ def clean_text(self, text: str): return text def text_to_positional_q_gram(self, text: str) -> list: - """Return a positional list of qgrams for the given string. + """Return a positional list of q-grams for the given string. - https://en.wikipedia.org/wiki/N-gram q-grams are n-grams on character level. + q-grams at q=2 of "word" would be "wo", "or" and "rd" + https://en.wikipedia.org/wiki/N-gram - qgrams of "word" would be "wo", "or" and "rd" for q=2 - - Note: these are technically positional q grams, but we don't use their - positions currently + Note: these are technically _positional_ q-grams, but we don't use their positions currently. """ q = self.q # just return a single-item list if the text is equal or shorter than q - # else, generate qgrams + # else, generate q-grams if len(text) <= q: return [text] else: @@ -145,10 +149,9 @@ def words_in_df(self, df: pd.DataFrame = None) -> tuple[dict, pd.DataFrame]: col = [] for row in df.itertuples(index=True): - line = self.clean_text(" ".join(row[1:])) + line = self.clean_text(" | ".join(row[1:])) col.append(line) identifier_word_dict[row[0]] = Counter(line.split(" ")) - return_df["query_col"] = col return identifier_word_dict, return_df @@ -166,6 +169,7 @@ def reverse_dict_many_to_one(self, dictionary: dict) -> dict: def list_to_q_grams(self, word_list: Iterable) -> dict: """Convert a list of unique words to a dict with Counter objects. + Number will be the occurrences of that q-gram in that word. 
q_gram_dict = { "word": Counter( @@ -191,9 +195,13 @@ def add_identifier(self, identifier, data: dict) -> None: identifier is expected to be a unique identifier that has not been used before data is expected to be a dict of column names and data """ - - # make sure we don't add an identifier that already exists - assert identifier not in self.df.index.to_list() + # make sure we the identifier does not yet exist + if identifier in self.df.index.to_list(): + raise Exception( + f"Identifier '{identifier}' is already in use, use a different identifier or use the change_identifier function.") + if data[self.identifier_name] != identifier: + raise Exception( + f"Identifier argument '{identifier}' and data in identifier column '{data[self.identifier_name]}' must be the same.") df_cols = self.columns @@ -217,6 +225,10 @@ def add_identifier(self, identifier, data: dict) -> None: def remove_identifier(self, identifier) -> None: """Remove this identifier from self.df and the search index. """ + # make sure the identifier exists + if identifier not in self.df.index.to_list(): + raise Exception( + f"Identifier '{identifier}' does not exist in the search data, cannot remove identifier that do not exist.") # remove from df self.df.drop(identifier, inplace=True) @@ -238,7 +250,7 @@ def remove_identifier(self, identifier) -> None: del self.word_to_q_grams[word] else: - # remove the identifier from the + # remove the identifier from the dict del self.word_to_identifier[word][identifier] # finally, remove the identifier del self.identifier_to_word[identifier] @@ -251,10 +263,17 @@ def change_identifier(self, identifier, data: dict) -> None: only changed data needs to be supplied """ - assert identifier in self.df.index.to_list() + # make sure the identifier exists + if identifier not in self.df.index.to_list(): + raise Exception( + f"Identifier '{identifier}' does not exist in the search data, use an existing identifier or use the add_identifier function.") + if data[self.identifier_name] != identifier: + raise Exception( + f"Identifier argument '{identifier}' and data in identifier column '{data[self.identifier_name]}' must be the same.") # get existing data - update_data = dict(self.df.loc[identifier].values) + update_data = {col: self.df.loc[identifier, col] for col in self.df.columns} + del update_data["query_col"] # overwrite new data where relevant for field, value in data.items(): @@ -297,35 +316,39 @@ def osa_distance(self, word1: str, word2: str, cutoff: int = 0, cutoff_return: i """Calculate the Optimal String Alignment (OSA) edit distance between two strings, return edit distance. Has additional cutoff variable, if cutoff is higher than 0 and if the words have - a larger difference in length, immediately return a large number + a larger edit distance, return a large number (note: cutoff <= edit_dist, not cutoff < edit_dist) OSA is a restricted form of the Damerau–Levenshtein distance. https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance#Optimal_string_alignment_distance The edit distance is how many operations (insert, delete, substitute or transpose a character) need to happen to convert one string to another. insert and delete are obvious operations, but substitute and transpose are explained: - substitute: replace one character with another: e.g. word1=cat word2=cab, t->b substitution is 1 operation - transpose: swap the places of two adjacent characters with each other: e.g. 
word1=coal word2=cola al -> la transposition is 1 operation + substitute: replace one character with another: e.g. word1='cat' word2='cab', 't'->'b' substitution is 1 operation + transpose: swap the places of two adjacent characters with each other: e.g. word1='coal' word2='cola' 'al' -> 'la' transposition is 1 operation - The minimum amount of operations (OSA edit distance) is returned. + The minimum amount of edit operations (OSA edit distance) is returned. """ - if word1 == word2: # if the strings are the same, immediately return 0 return 0 len1, len2 = len(word1), len(word2) - if 0 < cutoff < abs(len1 - len2): + if 0 < cutoff <= abs(len1 - len2): # if the length difference between 2 words is over the cutoff, # just return instead of calculating the edit distance return cutoff_return if len1 == 0 or len2 == 0: # in case (at least) one of the strings is empty, - # return the lenth of the longest string + # return the length of the longest string return max(len1, len2) + if len1 < len2 and cutoff > 0: + # make sure word1 is always the longest (required for early stopping with cutoff) + word1, word2 = word2, word1 + len1, len2 = len2, len1 + # Initialize matrix distance = [[0] * len2 for _ in range(len1)] @@ -346,9 +369,12 @@ def osa_distance(self, word1: str, word2: str, cutoff: int = 0, cutoff_return: i transposition = distance[i - 2][j - 2] + 1 if i > 1 and j > 1 else max(i, j) - 1 distance[i][j] = min(distance[i][j], transposition) - return distance[len1 - 1][len2 - 1] + # stop early if we surpass cutoff + if 0 < cutoff <= distance[i][j]: + return cutoff_return + return distance[i][j] - def find_q_gram_matches(self, q_grams: list) -> pd.DataFrame: + def find_q_gram_matches(self, q_grams: set) -> pd.DataFrame: """Find which of the given q_grams exist in self.q_gram_to_word, return a sorted dataframe of best matching words. """ @@ -356,7 +382,7 @@ def find_q_gram_matches(self, q_grams: list) -> pd.DataFrame: matches = {} - # find words that match our qgrams + # find words that match our q-grams for q_gram in q_grams: if words := self.q_gram_to_word.get(q_gram, False): # q_gram exists in our search index @@ -371,12 +397,12 @@ def find_q_gram_matches(self, q_grams: list) -> pd.DataFrame: # reduce search results to most relevant results matches = {"word": matches.keys(), "matches": matches.values()} matches = pd.DataFrame(matches) - max_q = max(matches["matches"]) + max_q = max(matches["matches"]) # this has the most matching q-grams # determine how many results we want to keep based on how good our results are - min_q = max(max_q * 0.5, # have at least half of qgrams of best match or... - max(n_q_grams * 0.5, # if more, at least half the qgrams in the query word? - 1)) # okay just do 1 qgram if there are no more in the word + min_q = max(max_q * 0.32, # have at least a third of q-grams of best match or... + max(n_q_grams * 0.5, # if more, at least half the q-grams in the query word? 
+ 1)) # okay just do 1 q-gram if there are no more in the word matches = matches[matches["matches"] >= min_q] matches = matches.sort_values(by="matches", ascending=False) @@ -400,11 +426,9 @@ def spell_check(self, text: str) -> OrderedDict: ) """ - word_results = OrderedDict() matches_goal = 3 # ideally we have at least this many alternatives - always_accept_this = 1 # values of this or lower always accepted never_accept_this = 4 # values this or over always rejected @@ -420,12 +444,12 @@ def spell_check(self, text: str) -> OrderedDict: # first, find possible matches quickly q_grams = self.text_to_positional_q_gram(word) - possible_matches = self.find_q_gram_matches(q_grams) + possible_matches = self.find_q_gram_matches(set(q_grams)) matches = [] other_matches = {} - # now, refine with levenshtein + # now, refine with edit distance for row in possible_matches.itertuples(): edit_distance = self.osa_distance(word, row[1], cutoff=never_accept_this) @@ -443,13 +467,11 @@ def spell_check(self, text: str) -> OrderedDict: # if we have fewer matches than goal, add more 'less good' matches if len(matches) < matches_goal: for i in range(always_accept_this + 1, never_accept_this): - # iteratively increate worse matches + # iteratively increase 'worse' matches so we hit goal of minimum alternatives if new := other_matches.get(i): matches = matches + new - if len(matches) >= matches_goal: break - word_results[word] = matches return word_results @@ -477,19 +499,17 @@ def build_queries(self, query_text) -> list: return all_queries - def weigh_identifiers(self, identifiers: Iterable, weight: int, weighted_ids: Counter) -> Counter: - """Add weights to identifier counter for these identifiers""" - - for identifier in identifiers: - weighted_ids[identifier] += int(weight) - + def weigh_identifiers(self, identifiers: Counter, weight: int, weighted_ids: Counter) -> Counter: + """Add weights to identifier counter for these identifiers times how often it occurs in identifier.""" + for identifier, occurrences in identifiers.items(): + weighted_ids[identifier] += (weight * occurrences) return weighted_ids - def search_size_1(self, queries: list, original_words: list, orig_word_weight=11, exact_word_weight=1) -> dict: + def search_size_1(self, queries: list, original_words: list, orig_word_weight=5, exact_word_weight=1) -> dict: """Return a dict of {query_word: Counter(identifier)}. queries: is a list of len 1 tuple/lists of words that are a searched word or a 'spell checked' similar word - original words: a list of words actually searched for (not including spellechecked) + original words: a list of words actually searched for (not including spellchecked) orig_word_weight: additional weight to add to original words exact_word_weight: additional weight to add to exact word matches (as opposed to be 'in' str) @@ -514,7 +534,6 @@ def search_size_1(self, queries: list, original_words: list, orig_word_weight=11 for word, matching_words in matches.items(): for matched_word in matching_words: weight = self.base_weight - id_counter = matched_identifiers.get(word, Counter()) # add the word n times, where n is the weight, original search word is weighted higher than alternatives @@ -531,6 +550,19 @@ def search_size_1(self, queries: list, original_words: list, orig_word_weight=11 def fuzzy_search(self, text: str) -> list: """Search the dataframe, finding approximate matches and return a list of identifiers, ranked by how well each identifier matches the search text. + + 1. 
First, identifiers matching single words (and spell-checked alternatives) are found and weighted. + 2. If the search term consisted of multiple words, combinations of those words are checked next. + 2.1 Increasing in size (first two words, then three etc.), we look for identifiers that contain that set of + words, these are also weighted, based on the sum of all one-word weights (from first step) and the length + of the sequence. + 2.2 Next, we also look specifically for combinations occurring next to each other. And add more weight like + the step above (2.1). + We multiply the weighting of step 2 by the sequence length, based on the assumption that finding more search + words will be a more relevant result than just finding a single word, and again if they are in the + correct order. + + Finally, all found identifiers are sorted on their weight and returned. """ queries = self.build_queries(text) @@ -569,13 +601,24 @@ def fuzzy_search(self, text: str) -> list: # get the intersection of all identifiers # meaning, a set of identifiers that occur in ALL sets of len(1) for the individual words in the query - # this ensures we only ever search data where ALL items occur to reduce search-space - query_identifiers = set.intersection(*[set(query_to_identifier.get(q_word)) for q_word in query if - query_to_identifier.get(q_word, False)]) - if len(query_identifiers) == 0: + # this ensures we only ever search data where ALL items occur to substantially reduce search-space + # finally, make this a Counter (with each item=1) so we can properly weigh things later + query_identifier_set = set.intersection(*[set(query_to_identifier.get(q_word)) for q_word in query if + query_to_identifier.get(q_word, False)]) + if len(query_identifier_set) == 0: # there is no match for this combination of query words, skip break + # now we convert the query identifiers to a Counter of 'occurrence', + # where we weigh queries with only original words higher + query_identifiers = Counter() + for identifier in query_identifier_set: + weight = 0 + for query_word in query: + weight += query_to_identifier[query_word][identifier] + + query_identifiers[identifier] = weight + # we now add these identifiers to a counter for this query name, query_name = " ".join(query) @@ -590,12 +633,15 @@ def fuzzy_search(self, text: str) -> list: if len(new_df) == 0: # there is no match for this permutation of words, skip continue - new_ids = list(new_df[self.identifier_name]) - # we weigh a combination of words and next to each other even higher than just the words separately - weight = self.base_weight * q_len * q_len + new_id_list = new_df[self.identifier_name] + + new_ids = Counter() + for new_id in new_id_list: + new_ids[new_id] = query_identifiers[new_id] + + # we weigh a combination of words that is next also to each other even higher than just the words separately query_to_identifier[query_name] = self.weigh_identifiers(new_ids, weight, query_to_identifier[query_name]) - # now finally, move to one object sorted list by highest score all_identifiers = Counter() for identifiers in query_to_identifier.values(): diff --git a/tests/test_search.py b/tests/test_search.py new file mode 100644 index 000000000..231859d28 --- /dev/null +++ b/tests/test_search.py @@ -0,0 +1,181 @@ +import pytest +import pandas as pd +from activity_browser.bwutils.search import SearchEngine + + +def data_for_test(): + return pd.DataFrame([ + ["a", "coal production", "coal"], + ["b", "coal production", "something"], + ["c", "coal production", "coat"], + ["d", "coal 
hello production", "something"], + ["e", "dont find me", "hello world"], + ["f", "coat", "another word"], + ["g", "coalispartofthisword", "things"], + ["h", "coal", "coal"], + ], + columns = ["id", "col1", "col2"]) + + +def test_search_init(): + """Do initialization tests.""" + df = data_for_test() + + # init search class with non-existent identifier col and fail + with pytest.raises(Exception): + _ = SearchEngine(df, identifier_name="non_existent_col_name") + # init search class with non-unique identifiers and fail + df2 = df.copy() + df2.iloc[0, 0] = "b" + with pytest.raises(Exception): + _ = SearchEngine(df2, identifier_name="id") + # init search class correctly + se = SearchEngine(df, identifier_name="id") + + +def test_search_base(): + """Do checks for search ranking.""" + + df = data_for_test() + + # init search class and two searches + se = SearchEngine(df, identifier_name="id") + # do search on specific term + assert se.search("coal") == ["a", "h", "c", "b", "d", "g", "f"] + # do search on other term + assert se.search("coal production") == ["a", "c", "b", "d", "h", "f", "g"] + + # init search class with 1 col searchable + se = SearchEngine(df, identifier_name="id", searchable_columns=["col2"]) + assert se.search("coal") == ["a", "h", "c"] + + +def test_search_add_identifier(): + """Do tests for adding identifier.""" + df = data_for_test() + + # create base item to add + new_base_item = { + "id": "i", + "col1": "coal production", + "col2": "coal production" + } + + # use mismatched identifier and fail + se = SearchEngine(df, identifier_name="id") + with pytest.raises(Exception): + se.add_identifier(identifier="j", data=new_base_item) + + # use existing identifier and fail + se = SearchEngine(df, identifier_name="id") + wrong_id = new_base_item.copy() + wrong_id["id"] = "a" + with pytest.raises(Exception): + se.add_identifier(identifier="a", data=wrong_id) + + # use column too many (should be removed) + se = SearchEngine(df, identifier_name="id") + col_more = new_base_item.copy() + col_more["col3"] = "word" + se.add_identifier(identifier="i", data=col_more) + assert "col3" not in se.df.columns + + # use column less (should be filled with empty string) + se = SearchEngine(df, identifier_name="id") + col_less = new_base_item.copy() + del col_less["col2"] + se.add_identifier(identifier="i", data=col_less) + assert se.df.loc["i", "col2"] == "" + + # do search, add item and verify results are different + se = SearchEngine(df, identifier_name="id") + assert se.search("coal production") == ["a", "c", "b", "d", "h", "f", "g"] + se.add_identifier(identifier="i", data=new_base_item) + assert se.search("coal production") == ["i", "a", "c", "b", "d", "h", "f", "g"] + + +def test_search_remove_identifier(): + """Do tests for removing identifier.""" + df = data_for_test() + + # use non-existent identifier and fail + se = SearchEngine(df, identifier_name="id") + with pytest.raises(Exception): + se.remove_identifier(identifier="i") + + # do search, remove item and verify results are different + se = SearchEngine(df, identifier_name="id") + assert se.search("coal production") == ["a", "c", "b", "d", "h", "f", "g"] + se.remove_identifier(identifier="a") + assert se.search("coal production") == ["c", "b", "d", "h", "f", "g"] + + +def test_search_change_identifier(): + """Do tests for changing identifier.""" + df = data_for_test() + + # create base item to add + edit_data = { + "id": "a", + "col1": "cant find me anymore", + "col2": "something different" + } + + # use non-existent identifier and fail + 
se = SearchEngine(df, identifier_name="id") + missing_id = edit_data.copy() + missing_id["id"] = "i" + with pytest.raises(Exception): + se.change_identifier(identifier="i", data=missing_id) + + # use mismatched identifier and fail + se = SearchEngine(df, identifier_name="id") + wrong_id = edit_data.copy() + wrong_id["id"] = "i" + with pytest.raises(Exception): + se.change_identifier(identifier="a", data=wrong_id) + + # do search, change item and verify results are different + se = SearchEngine(df, identifier_name="id") + assert se.search("coal production") == ["a", "c", "b", "d", "h", "f", "g"] + se.change_identifier(identifier="a", data=edit_data) + assert se.search("coal production") == ["c", "b", "d", "h", "f", "g"] + # now change the same item partially and verify results are different + new_edit_data = { + "id": "a", + "col1": "coal" + } + se.change_identifier(identifier="a", data=new_edit_data) + assert se.search("coal production") == ["c", "b", "d", "h", "a", "f", "g"] + + +def test_string_distance(): + """Do tests specifically for string distance function""" + df = data_for_test() + se = SearchEngine(df, identifier_name="id") + + # same word + assert se.osa_distance("coal", "coal") == 0 + # empty string is length of other word + assert se.osa_distance("coal", "") == 4 + + # insert + assert se.osa_distance("coal", "coa") == 1 + # delete + assert se.osa_distance("coal", "coall") == 1 + # substitute + assert se.osa_distance("coal", "coat") == 1 + # transpose + assert se.osa_distance("coal", "cola") == 1 + + # longer edit distance + assert se.osa_distance("coal", "chocolate") == 6 + # reverse order gives same result + assert se.osa_distance("coal", "chocolate") == se.osa_distance("chocolate", "coal") + # cutoff + assert se.osa_distance("coal", "chocolate", cutoff=5, cutoff_return=1000) == 1000 + assert se.osa_distance("coal", "chocolate", cutoff=6, cutoff_return=1000) == 1000 + assert se.osa_distance("coal", "chocolate", cutoff=7, cutoff_return=1000) == 6 + # length cutoff + assert se.osa_distance("coal", "coallongword") == 8 + assert se.osa_distance("coal", "coallongword", cutoff=5, cutoff_return=1000) == 1000 From d01387fa2d0b9b8d4ad5fde805399a8bdfebea72 Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Thu, 24 Jul 2025 12:47:39 +0200 Subject: [PATCH 04/47] Improve search speed with many results. --- .../bwutils/search/searchengine.py | 45 +++++++++---------- 1 file changed, 22 insertions(+), 23 deletions(-) diff --git a/activity_browser/bwutils/search/searchengine.py b/activity_browser/bwutils/search/searchengine.py index a444e3d19..d0a722b22 100644 --- a/activity_browser/bwutils/search/searchengine.py +++ b/activity_browser/bwutils/search/searchengine.py @@ -2,7 +2,7 @@ import itertools import functools from collections import Counter, OrderedDict -from typing import Iterable +from typing import Iterable, Optional import pandas as pd import numpy as np import re @@ -287,13 +287,12 @@ def change_identifier(self, identifier, data: dict) -> None: # +++ Search - def filter_dataframe(self, df: pd.DataFrame, pattern: str, search_columns: list = None) -> pd.Series: + def filter_dataframe(self, df: pd.DataFrame, pattern: str, search_columns: Optional[list] = None) -> pd.Series: """Filter the search columns of a dataframe on a pattern. 
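The returned mask can be applied directly with .loc; a small sketch (the column name follows the test frame used elsewhere in this series):

    mask = engine.filter_dataframe(engine.df, "coal", search_columns=["col1"])
    hits = engine.df.loc[mask]  # rows whose col1 contains the lower-case pattern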
Returns a mask (true/false) pd.Series with matching items.""" search_columns = search_columns if search_columns else self.columns - mask = functools.reduce( np.logical_or, [ @@ -303,13 +302,15 @@ def filter_dataframe(self, df: pd.DataFrame, pattern: str, search_columns: list ) return mask - def literal_search(self, text): + def literal_search(self, text, df: Optional[pd.DataFrame] = None) -> list: """Do literal search of the text in all original columns that were given.""" - identifiers = self.filter_dataframe(self.df, text) - df = self.df.loc[identifiers] - identifiers = df.index.to_list() + if df is None: + df = self.df.copy() + identifiers = self.filter_dataframe(df, text) + df = df.loc[identifiers] + identifiers = df.index.to_list() return identifiers def osa_distance(self, word1: str, word2: str, cutoff: int = 0, cutoff_return: int = 1000) -> int: @@ -652,22 +653,20 @@ def fuzzy_search(self, text: str) -> list: return sorted_identifiers def search(self, text) -> list: - """Search the dataframe on this text, return a sorted list of identifiers""" - - literal_identifiers = self.literal_search(text) + """Search the dataframe on this text, return a sorted list of identifiers.""" fuzzy_identifiers = self.fuzzy_search(text) - - ordered_literal_identifiers = [] - other_identifiers = [] - - # add all fuzzy identifiers to one of two lists, depending on whether they were found in literal search or not - # this guarantees we put the literal matches on top, but still sort within this group based on fuzzy scores - for identifier in fuzzy_identifiers: - if identifier in literal_identifiers: - ordered_literal_identifiers.append(identifier) - else: - other_identifiers.append(identifier) - - identifiers = ordered_literal_identifiers + other_identifiers + if len(fuzzy_identifiers) == 0: + return [] + + # take the fuzzy search sub-set of data and search it literally + df = self.df.loc[fuzzy_identifiers].copy() + literal_identifiers = self.literal_search(text, df) + if len(literal_identifiers) == 0: + return fuzzy_identifiers + + # append any fuzzy identifiers that were not found in the literal search + fuzzy_identifiers = [ + _id for _id in fuzzy_identifiers if _id not in set(literal_identifiers)] + identifiers = literal_identifiers + fuzzy_identifiers return identifiers From 295995e4cbebdcc548185b64648364c2d03f16d6 Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Thu, 24 Jul 2025 16:09:05 +0200 Subject: [PATCH 05/47] Add basic logging to SearchEngine --- .../bwutils/search/searchengine.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/activity_browser/bwutils/search/searchengine.py b/activity_browser/bwutils/search/searchengine.py index d0a722b22..8b097cc84 100644 --- a/activity_browser/bwutils/search/searchengine.py +++ b/activity_browser/bwutils/search/searchengine.py @@ -2,12 +2,17 @@ import itertools import functools from collections import Counter, OrderedDict +from logging import getLogger +from time import time from typing import Iterable, Optional import pandas as pd import numpy as np import re +log = getLogger(__name__) + + class SearchEngine: """ A Search Engine class, takes a dataframe and makes it searchable. 
@@ -33,7 +38,7 @@ class SearchEngine: """ def __init__(self, df: pd.DataFrame, identifier_name: str, searchable_columns: list = []): - + t = time() # compile regex patterns for cleaning self.SUB_PATTERN = re.compile(r"[,\(\)\[\]'\"]") # for replacing with empty string self.SPACE_PATTERN = re.compile(r"[-−:;]") # for replacing with space @@ -70,7 +75,7 @@ def __init__(self, df: pd.DataFrame, identifier_name: str, searchable_columns: l # find the self.identifier_name column index and store as int self.identifier_column = self.columns.index(self.identifier_name) - # store all searchable columns except the identifier + # store all searchable column indices except the identifier self.regular_columns = [i for i in range(len(self.columns)) if i != self.identifier_column] # initialize search index dicts and update df @@ -81,6 +86,7 @@ def __init__(self, df: pd.DataFrame, identifier_name: str, searchable_columns: l self.df = pd.DataFrame() self.update_index(df) + log.debug(f"Search engine initialized in {time() - t:.2f} seconds for {len(self.df)} items") # +++ Utility functions @@ -652,16 +658,21 @@ def fuzzy_search(self, text: str) -> list: sorted_identifiers = [identifier[0] for identifier in all_identifiers.most_common()] return sorted_identifiers - def search(self, text) -> list: + def search(self, text, col_modifiers: Optional[dict] = None) -> list: """Search the dataframe on this text, return a sorted list of identifiers.""" + t = time() fuzzy_identifiers = self.fuzzy_search(text) if len(fuzzy_identifiers) == 0: + log.debug(f"Found 0 search results for '{text}' in {len(self.df)} items in {time() - t:.2f} seconds") return [] # take the fuzzy search sub-set of data and search it literally df = self.df.loc[fuzzy_identifiers].copy() + literal_identifiers = self.literal_search(text, df) if len(literal_identifiers) == 0: + log.debug( + f"Found {len(fuzzy_identifiers)} search results for '{text}' in {len(self.df)} items in {time() - t:.2f} seconds") return fuzzy_identifiers # append any fuzzy identifiers that were not found in the literal search @@ -669,4 +680,6 @@ def search(self, text) -> list: _id for _id in fuzzy_identifiers if _id not in set(literal_identifiers)] identifiers = literal_identifiers + fuzzy_identifiers + log.debug( + f"Found {len(identifiers)} ({len(literal_identifiers)} literal) search results for '{text}' in {len(self.df)} items in {time() - t:.2f} seconds") return identifiers From 79754ca7659e41ba69f2ce103b0e6ee8662821a1 Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Thu, 24 Jul 2025 16:24:34 +0200 Subject: [PATCH 06/47] . 
--- activity_browser/bwutils/search/searchengine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/activity_browser/bwutils/search/searchengine.py b/activity_browser/bwutils/search/searchengine.py index 8b097cc84..86e56f29e 100644 --- a/activity_browser/bwutils/search/searchengine.py +++ b/activity_browser/bwutils/search/searchengine.py @@ -658,7 +658,7 @@ def fuzzy_search(self, text: str) -> list: sorted_identifiers = [identifier[0] for identifier in all_identifiers.most_common()] return sorted_identifiers - def search(self, text, col_modifiers: Optional[dict] = None) -> list: + def search(self, text) -> list: """Search the dataframe on this text, return a sorted list of identifiers.""" t = time() fuzzy_identifiers = self.fuzzy_search(text) From 6bd39f71af2367d1fc1bfea115012d75bee95402 Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Mon, 28 Jul 2025 13:48:07 +0200 Subject: [PATCH 07/47] Base implementation of metadata specific class --- activity_browser/bwutils/__init__.py | 2 +- activity_browser/bwutils/metadata.py | 14 +- activity_browser/bwutils/search/__init__.py | 2 +- .../bwutils/search/searchengine.py | 185 +++++++++++++++++- 4 files changed, 197 insertions(+), 6 deletions(-) diff --git a/activity_browser/bwutils/__init__.py b/activity_browser/bwutils/__init__.py index 4bb75c292..12b565e61 100644 --- a/activity_browser/bwutils/__init__.py +++ b/activity_browser/bwutils/__init__.py @@ -12,7 +12,7 @@ from .montecarlo import MonteCarloLCA from .multilca import MLCA, Contributions from .pedigree import PedigreeMatrix -from .search import SearchEngine +from .search import SearchEngine, MetaDataSearchEngine from .sensitivity_analysis import GlobalSensitivityAnalysis from .superstructure import SuperstructureContributions, SuperstructureMLCA from .uncertainty import (CFUncertaintyInterface, ExchangeUncertaintyInterface, diff --git a/activity_browser/bwutils/metadata.py b/activity_browser/bwutils/metadata.py index 9790e7885..dc7e7328a 100644 --- a/activity_browser/bwutils/metadata.py +++ b/activity_browser/bwutils/metadata.py @@ -7,8 +7,6 @@ from typing import Set from logging import getLogger -from playhouse.shortcuts import model_to_dict - import pandas as pd from qtpy.QtCore import Qt, QObject, Signal, SignalInstance @@ -17,6 +15,8 @@ from bw2data.errors import UnknownObject from bw2data.backends import sqlite3_lci_db, ActivityDataset +from activity_browser.bwutils.search import MetaDataSearchEngine + from activity_browser import signals @@ -190,6 +190,7 @@ def sync(self) -> None: con.close() self.dataframe = self._parse_df(node_df) + self.init_search() # init search index self.synced.emit() @@ -333,5 +334,14 @@ def _unpacker(self, classifications: list, system: str) -> list: system_classifications.append(result) # result is either "" or the classification return system_classifications + def init_search(self): + allowed_cols = [ + "id", "name", "synonyms", "unit", "key", "database", # generic + "CAS number", "categories", # biosphere specific + "product", "reference product", "classifications", "location", "properties" # activity specific + ] + + MetaDataSearchEngine(self.dataframe, identifier_name="id", searchable_columns=allowed_cols) + AB_metadata = MetaDataStore() diff --git a/activity_browser/bwutils/search/__init__.py b/activity_browser/bwutils/search/__init__.py index 042045a6b..85e30c9be 100644 --- a/activity_browser/bwutils/search/__init__.py +++ b/activity_browser/bwutils/search/__init__.py @@ -1 +1 @@ -from .searchengine import SearchEngine \ No newline at 
end of file +from .searchengine import SearchEngine, MetaDataSearchEngine \ No newline at end of file diff --git a/activity_browser/bwutils/search/searchengine.py b/activity_browser/bwutils/search/searchengine.py index 86e56f29e..f1a32f4a1 100644 --- a/activity_browser/bwutils/search/searchengine.py +++ b/activity_browser/bwutils/search/searchengine.py @@ -676,9 +676,190 @@ def search(self, text) -> list: return fuzzy_identifiers # append any fuzzy identifiers that were not found in the literal search - fuzzy_identifiers = [ + remaining_fuzzy_identifiers = [ _id for _id in fuzzy_identifiers if _id not in set(literal_identifiers)] - identifiers = literal_identifiers + fuzzy_identifiers + identifiers = literal_identifiers + remaining_fuzzy_identifiers + + log.debug( + f"Found {len(identifiers)} ({len(literal_identifiers)} literal) search results for '{text}' in {len(self.df)} items in {time() - t:.2f} seconds") + return identifiers + + +class MetaDataSearchEngine(SearchEngine): + def find_q_gram_matches(self, q_grams: set) -> pd.DataFrame: + """Overwritten for extra database specific reduction of results. + """ + n_q_grams = len(q_grams) + + matches = {} + + # find words that match our q-grams + for q_gram in q_grams: + if words := self.q_gram_to_word.get(q_gram, False): + # q_gram exists in our search index + for word in words: + if isinstance(self.database_ids, set): + # DATABASE SPECIFIC now filter on whether word is in the database + in_db = False + for _id in self.word_to_identifier[word]: + if _id in self.database_ids: + in_db = True + break + else: + in_db = True + if in_db: + matches[word] = matches.get(word, 0) + words[word] + + # if we find no results, return an empty dataframe + if len(matches) == 0: + return pd.DataFrame({"word": [], "matches": []}) + + # otherwise, create a dataframe and + # reduce search results to most relevant results + matches = {"word": matches.keys(), "matches": matches.values()} + matches = pd.DataFrame(matches) + max_q = max(matches["matches"]) # this has the most matching q-grams + + # determine how many results we want to keep based on how good our results are + min_q = max(max_q * 0.32, # have at least a third of q-grams of best match or... + max(n_q_grams * 0.5, # if more, at least half the q-grams in the query word? + 1)) # okay just do 1 q-gram if there are no more in the word + + matches = matches[matches["matches"] >= min_q] + matches = matches.sort_values(by="matches", ascending=False) + matches = matches.reset_index(drop=True) + + return matches.iloc[:min(len(matches), 2500), :] # return at most this many results + + def fuzzy_search(self, text: str) -> list: + """Overwritten for extra database specific reduction of results. 
+ """ + queries = self.build_queries(text) + + # make list of unique original words + orig_words = OrderedDict() + for word in text.split(" "): + orig_words[word] = False + orig_words = orig_words.keys() + orig_words = [self.clean_text(word) for word in orig_words] + + # order the queries by the amount of words they contain + # we do this because longer queries (more words) are harder to find, but we have many alternatives so we search in a smaller search space + queries_by_size = OrderedDict() + longest_query = max([len(q) for q in queries]) + for query_len in range(1, longest_query + 1): + queries_by_size[query_len] = [q for q in queries if len(q) == query_len] + + # first handle queries of length 1 + query_to_identifier = self.search_size_1(queries_by_size[1], orig_words) + + # DATABASE SPECIFIC ensure all identifiers are in the database + if isinstance(self.database_ids, set): + new_q2i = {} + for word, _ids in query_to_identifier.items(): + keep = set.intersection(set(_ids.keys()), self.database_ids) + new_id_counter = Counter() + for _id in keep: + new_id_counter[_id] = _ids[_id] + if len(new_id_counter) > 0: + new_q2i[word] = new_id_counter + query_to_identifier = new_q2i + + # get all results into a df, we rank further later + all_identifiers = set() + for id_list in [id_list for id_list in query_to_identifier.values()]: + all_identifiers.update(id_list) + search_df = self.df.loc[list(all_identifiers)] + + # now, we search for combinations of query words and get only those identifiers + # we then reduce de search_df further for only those matching identifiers + # we then search the permutations of that set of words + for q_len, query_set in queries_by_size.items(): + if q_len == 1: + # we already did these above + continue + for query in query_set: + + # get the intersection of all identifiers + # meaning, a set of identifiers that occur in ALL sets of len(1) for the individual words in the query + # this ensures we only ever search data where ALL items occur to substantially reduce search-space + # finally, make this a Counter (with each item=1) so we can properly weigh things later + query_identifier_set = set.intersection(*[set(query_to_identifier.get(q_word)) for q_word in query if + query_to_identifier.get(q_word, False)]) + if len(query_identifier_set) == 0: + # there is no match for this combination of query words, skip + break + + # now we convert the query identifiers to a Counter of 'occurrence', + # where we weigh queries with only original words higher + query_identifiers = Counter() + for identifier in query_identifier_set: + weight = 0 + for query_word in query: + weight += query_to_identifier[query_word][identifier] + + query_identifiers[identifier] = weight + + # we now add these identifiers to a counter for this query name, + query_name = " ".join(query) + + weight = self.base_weight * q_len + query_to_identifier[query_name] = self.weigh_identifiers(query_identifiers, weight, Counter()) + + # now search for all permutations of this query combined with a space + query_df = search_df[search_df[self.identifier_name].isin(query_identifiers)] + for query_perm in permutations(query): + mask = self.filter_dataframe(query_df, " ".join(query_perm), search_columns=["query_col"]) + new_df = query_df.loc[mask].reset_index(drop=True) + if len(new_df) == 0: + # there is no match for this permutation of words, skip + continue + new_id_list = new_df[self.identifier_name] + + new_ids = Counter() + for new_id in new_id_list: + new_ids[new_id] = query_identifiers[new_id] + + # we 
weigh a combination of words that is next also to each other even higher than just the words separately + query_to_identifier[query_name] = self.weigh_identifiers(new_ids, weight, + query_to_identifier[query_name]) + # now finally, move to one object sorted list by highest score + all_identifiers = Counter() + for identifiers in query_to_identifier.values(): + all_identifiers += identifiers + + # now sort on highest weights and make list type + sorted_identifiers = [identifier[0] for identifier in all_identifiers.most_common()] + return sorted_identifiers + + def search(self, text, database: Optional[str] = None) -> list: + """Search the dataframe on this text, return a sorted list of identifiers.""" + t = time() + + # get the set of ids that is in this database + if database is not None: + self.database_ids = set(self.df[self.df["database"] == database].index.to_list()) + else: + self.database_ids = None + + fuzzy_identifiers = self.fuzzy_search(text) + if len(fuzzy_identifiers) == 0: + log.debug(f"Found 0 search results for '{text}' in {len(self.df)} items in {time() - t:.2f} seconds") + return [] + + # take the fuzzy search sub-set of data and search it literally + df = self.df.loc[fuzzy_identifiers].copy() + + literal_identifiers = self.literal_search(text, df) + if len(literal_identifiers) == 0: + log.debug( + f"Found {len(fuzzy_identifiers)} search results for '{text}' in {len(self.df)} items in {time() - t:.2f} seconds") + return fuzzy_identifiers + + # append any fuzzy identifiers that were not found in the literal search + remaining_fuzzy_identifiers = [ + _id for _id in fuzzy_identifiers if _id not in set(literal_identifiers)] + identifiers = literal_identifiers + remaining_fuzzy_identifiers log.debug( f"Found {len(identifiers)} ({len(literal_identifiers)} literal) search results for '{text}' in {len(self.df)} items in {time() - t:.2f} seconds") From 91a3328bfd732568e43029433a3e4adfff639171 Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Mon, 18 Aug 2025 10:04:55 +0200 Subject: [PATCH 08/47] minor changes to searchengine --- .../bwutils/search/searchengine.py | 26 ++++++++++++++----- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/activity_browser/bwutils/search/searchengine.py b/activity_browser/bwutils/search/searchengine.py index f1a32f4a1..cbb088afd 100644 --- a/activity_browser/bwutils/search/searchengine.py +++ b/activity_browser/bwutils/search/searchengine.py @@ -195,19 +195,19 @@ def list_to_q_grams(self, word_list: Iterable) -> dict: # +++ Changes to searchable data - def add_identifier(self, identifier, data: dict) -> None: + def add_identifier(self, data: dict, make_searchable=[]) -> None: """Add this identifier to the search index. 
identifier is expected to be a unique identifier that has not been used before data is expected to be a dict of column names and data """ + #TODO add ability to add new columns with make_searchable + identifier = data[self.identifier_name] + # make sure we the identifier does not yet exist if identifier in self.df.index.to_list(): raise Exception( f"Identifier '{identifier}' is already in use, use a different identifier or use the change_identifier function.") - if data[self.identifier_name] != identifier: - raise Exception( - f"Identifier argument '{identifier}' and data in identifier column '{data[self.identifier_name]}' must be the same.") df_cols = self.columns @@ -273,9 +273,12 @@ def change_identifier(self, identifier, data: dict) -> None: if identifier not in self.df.index.to_list(): raise Exception( f"Identifier '{identifier}' does not exist in the search data, use an existing identifier or use the add_identifier function.") - if data[self.identifier_name] != identifier: + if self.identifier_name in data.keys() and data[self.identifier_name] != identifier: raise Exception( - f"Identifier argument '{identifier}' and data in identifier column '{data[self.identifier_name]}' must be the same.") + "Identifier field cannot be changed, first remove item and then add new identifier") + if "query_col" in data.keys(): + log.debug( + f"Field 'query_col' is a protected field for search engine and will be ignored for changing {identifier}") # get existing data update_data = {col: self.df.loc[identifier, col] for col in self.df.columns} @@ -289,7 +292,7 @@ def change_identifier(self, identifier, data: dict) -> None: self.remove_identifier(identifier) # add entry with new data - self.add_identifier(identifier, update_data) + self.add_identifier(update_data) # +++ Search @@ -661,6 +664,11 @@ def fuzzy_search(self, text: str) -> list: def search(self, text) -> list: """Search the dataframe on this text, return a sorted list of identifiers.""" t = time() + + if len(text) == 0: + log.debug(f"Empty search, returned all items") + return self.df.index.to_list() + fuzzy_identifiers = self.fuzzy_search(text) if len(fuzzy_identifiers) == 0: log.debug(f"Found 0 search results for '{text}' in {len(self.df)} items in {time() - t:.2f} seconds") @@ -836,6 +844,10 @@ def search(self, text, database: Optional[str] = None) -> list: """Search the dataframe on this text, return a sorted list of identifiers.""" t = time() + if len(text) == 0: + log.debug(f"Empty search, returned all items") + return self.df.index.to_list() + # get the set of ids that is in this database if database is not None: self.database_ids = set(self.df[self.df["database"] == database].index.to_list()) From 8ec0cf401527b4d5e070662a54977fbc53493f16 Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Tue, 2 Sep 2025 12:46:40 +0200 Subject: [PATCH 09/47] - Solve bug in OSA distance for early stopping with long similar strings - Add and improve tests --- .../bwutils/search/searchengine.py | 92 ++++++++++---- tests/test_search.py | 117 +++++++++++++----- 2 files changed, 151 insertions(+), 58 deletions(-) diff --git a/activity_browser/bwutils/search/searchengine.py b/activity_browser/bwutils/search/searchengine.py index cbb088afd..913242a3a 100644 --- a/activity_browser/bwutils/search/searchengine.py +++ b/activity_browser/bwutils/search/searchengine.py @@ -39,6 +39,8 @@ class SearchEngine: def __init__(self, df: pd.DataFrame, identifier_name: str, searchable_columns: list = []): t = time() + log.debug(f"SearchEngine initializing for {len(df)} items") + # 
compile regex patterns for cleaning self.SUB_PATTERN = re.compile(r"[,\(\)\[\]'\"]") # for replacing with empty string self.SPACE_PATTERN = re.compile(r"[-−:;]") # for replacing with space @@ -86,7 +88,8 @@ def __init__(self, df: pd.DataFrame, identifier_name: str, searchable_columns: l self.df = pd.DataFrame() self.update_index(df) - log.debug(f"Search engine initialized in {time() - t:.2f} seconds for {len(self.df)} items") + + log.debug(f"SearchEngine Initialized in {time() - t:.2f} seconds") # +++ Utility functions @@ -102,6 +105,9 @@ def update_dict(update_me: dict, new: dict) -> dict: update_me[dict_key] = _counter return update_me + t = time() + size_old = len(self.df) + # identifier to word and df i2w, update_df = self.words_in_df(update_df) self.identifier_to_word = update_dict(self.identifier_to_word, i2w) @@ -119,6 +125,13 @@ def update_dict(update_me: dict, new: dict) -> dict: q2w = self.reverse_dict_many_to_one(w2q) self.q_gram_to_word = update_dict(self.q_gram_to_word, q2w) + size_new = len(self.df) + size_dif = size_new - size_old + size_msg = (f"{size_dif} changed items at {round(size_dif/(time() - t), 0)} items/sec " + f"({size_new} items currently)") if size_dif > 1 \ + else f"1 changed item ({size_new} items currently)" + log.debug(f"Search index updated in {time() - t:.2f} seconds for {size_msg}.") + def clean_text(self, text: str): """Clean a string so it doesn't contain weird characters or multiple spaces etc.""" text = self.SUB_PATTERN.sub("", text.lower()) @@ -193,6 +206,13 @@ def list_to_q_grams(self, word_list: Iterable) -> dict: return q_gram_dict + def word_in_index(self, word: str) -> bool: + """Convenience function to check if a single word is in the search index.""" + if " " in word: + raise Exception( + f"Given word '{word}' must not contain spaces.") + return word in self.word_to_identifier.keys() + # +++ Changes to searchable data def add_identifier(self, data: dict, make_searchable=[]) -> None: @@ -228,9 +248,12 @@ def add_identifier(self, data: dict, make_searchable=[]) -> None: # update the search index data self.update_index(new_df) - def remove_identifier(self, identifier) -> None: + def remove_identifier(self, identifier, logging=True) -> None: """Remove this identifier from self.df and the search index. """ + if logging: + t = time() + # make sure the identifier exists if identifier not in self.df.index.to_list(): raise Exception( @@ -261,6 +284,10 @@ def remove_identifier(self, identifier) -> None: # finally, remove the identifier del self.identifier_to_word[identifier] + if logging: + log.debug(f"Search index updated in {time() - t:.2f} seconds " + f"for 1 removed item ({len(self.df)} items currently).") + def change_identifier(self, identifier, data: dict) -> None: """Change this identifier. 
@@ -289,7 +316,7 @@ def change_identifier(self, identifier, data: dict) -> None: update_data[field] = value # remove the entry - self.remove_identifier(identifier) + self.remove_identifier(identifier, logging=False) # add entry with new data self.add_identifier(update_data) @@ -380,7 +407,7 @@ def osa_distance(self, word1: str, word2: str, cutoff: int = 0, cutoff_return: i distance[i][j] = min(distance[i][j], transposition) # stop early if we surpass cutoff - if 0 < cutoff <= distance[i][j]: + if 0 < cutoff <= min(distance[i]): return cutoff_return return distance[i][j] @@ -428,19 +455,24 @@ def spell_check(self, text: str) -> OrderedDict: We rank alternative words based on 1) edit distance 2) how often a word is used in an entry If too many results are found, we only keep edit distance 1, - if we want more results, we keep with longer edit distance up to ... - + if we want more results, we keep with longer edit distance up to `never_accept_this` word_results = OrderedDict( - "word": [word, work] + "word": [work] ) + NOTE: only ALTERNATIVES are ever returned, this function returns empty list for item BOTH when + 1) the exact word is in the data + 2) when there are no suitable alternatives """ + count_occurence = lambda x: sum(self.word_to_identifier[x].values()) # count occurences of a word + word_results = OrderedDict() - matches_goal = 3 # ideally we have at least this many alternatives - always_accept_this = 1 # values of this or lower always accepted - never_accept_this = 4 # values this or over always rejected + matches_min = 3 # ideally we have at least this many alternatives + matches_max = 10 # ideally don't much more than this many matches + always_accept_this = 1 # values of this edit distance or lower always accepted + never_accept_this = 4 # values this edit distance or over always rejected # make list of unique words words = OrderedDict() @@ -451,12 +483,12 @@ def spell_check(self, text: str) -> OrderedDict: words = [self.clean_text(word) for word in words] for word in words: - # first, find possible matches quickly q_grams = self.text_to_positional_q_gram(word) possible_matches = self.find_q_gram_matches(set(q_grams)) matches = [] + first_matches = Counter() other_matches = {} # now, refine with edit distance @@ -467,23 +499,33 @@ def spell_check(self, text: str) -> OrderedDict: if edit_distance == 0: continue # we are looking for alternatives only, not the exact word elif edit_distance <= always_accept_this: - matches.append(row[1]) + first_matches[row[1]] = count_occurence(row[1]) elif edit_distance < never_accept_this: - if other_matches.get(edit_distance): - other_matches[edit_distance].append(row[1]) - else: - other_matches[edit_distance] = [row[1]] + if not other_matches.get(edit_distance): + other_matches[edit_distance] = Counter() + other_matches[edit_distance][row[1]] = count_occurence(row[1]) + else: + continue + # add matches in correct order: + for match, _ in first_matches.most_common(): + matches.append(match) # if we have fewer matches than goal, add more 'less good' matches - if len(matches) < matches_goal: + if len(matches) < matches_min: for i in range(always_accept_this + 1, never_accept_this): - # iteratively increase 'worse' matches so we hit goal of minimum alternatives + # iteratively increase matches with 'worse' results so we hit goal of minimum alternatives if new := other_matches.get(i): - matches = matches + new - if len(matches) >= matches_goal: - break - word_results[word] = matches + prev_num = 10e100 + for match, num in new.most_common(): + if 
num == prev_num: + matches.append(match) + elif num != prev_num and len(matches <= matches_max): + matches.append(match) + else: + break + prev_num = num + word_results[word] = matches return word_results def build_queries(self, query_text) -> list: @@ -515,7 +557,7 @@ def weigh_identifiers(self, identifiers: Counter, weight: int, weighted_ids: Cou weighted_ids[identifier] += (weight * occurrences) return weighted_ids - def search_size_1(self, queries: list, original_words: list, orig_word_weight=5, exact_word_weight=1) -> dict: + def search_size_1(self, queries: list, original_words: set, orig_word_weight=5, exact_word_weight=1) -> dict: """Return a dict of {query_word: Counter(identifier)}. queries: is a list of len 1 tuple/lists of words that are a searched word or a 'spell checked' similar word @@ -582,7 +624,7 @@ def fuzzy_search(self, text: str) -> list: for word in text.split(" "): orig_words[word] = False orig_words = orig_words.keys() - orig_words = [self.clean_text(word) for word in orig_words] + orig_words = {self.clean_text(word) for word in orig_words} # order the queries by the amount of words they contain # we do this because longer queries (more words) are harder to find, but we have many alternatives so we search in a smaller search space @@ -749,7 +791,7 @@ def fuzzy_search(self, text: str) -> list: for word in text.split(" "): orig_words[word] = False orig_words = orig_words.keys() - orig_words = [self.clean_text(word) for word in orig_words] + orig_words = {self.clean_text(word) for word in orig_words} # order the queries by the amount of words they contain # we do this because longer queries (more words) are harder to find, but we have many alternatives so we search in a smaller search space diff --git a/tests/test_search.py b/tests/test_search.py index 231859d28..036e6c864 100644 --- a/tests/test_search.py +++ b/tests/test_search.py @@ -17,6 +17,7 @@ def data_for_test(): columns = ["id", "col1", "col2"]) +# test standard init def test_search_init(): """Do initialization tests.""" df = data_for_test() @@ -33,8 +34,90 @@ def test_search_init(): se = SearchEngine(df, identifier_name="id") +# test internals +def test_reverse_dict(): + """Do test to reverse the special Counter dict.""" + df = data_for_test() + se = SearchEngine(df, identifier_name="id") + + # reverse once and verify + w2i = se.reverse_dict_many_to_one(se.identifier_to_word) + assert w2i == se.word_to_identifier + + # reverse same and verify is same as input + i2w = se.reverse_dict_many_to_one(w2i) + assert i2w == se.identifier_to_word + + +def test_string_distance(): + """Do tests specifically for string distance function.""" + df = data_for_test() + se = SearchEngine(df, identifier_name="id") + + # same word + assert se.osa_distance("coal", "coal") == 0 + # empty string is length of other word + assert se.osa_distance("coal", "") == 4 + + # insert + assert se.osa_distance("coal", "coa") == 1 + # delete + assert se.osa_distance("coal", "coall") == 1 + # substitute + assert se.osa_distance("coal", "coat") == 1 + # transpose + assert se.osa_distance("coal", "cola") == 1 + + # longer edit distance + assert se.osa_distance("coal", "chocolate") == 6 + # reverse order gives same result + assert se.osa_distance("coal", "chocolate") == se.osa_distance("chocolate", "coal") + # cutoff + assert se.osa_distance("coal", "chocolate", cutoff=5, cutoff_return=1000) == 1000 + assert se.osa_distance("coal", "chocolate", cutoff=6, cutoff_return=1000) == 1000 + assert se.osa_distance("coal", "chocolate", cutoff=7, 
cutoff_return=1000) == 6 + # length cutoff + assert se.osa_distance("coal", "coallongword") == 8 + assert se.osa_distance("coal", "coallongword", cutoff=5, cutoff_return=1000) == 1000 + + # two entirely different words (test of early stopping) + assert se.osa_distance("brown", "jumped") == 6 + assert se.osa_distance("brown", "jumped", cutoff=6, cutoff_return=1000) == 1000 + assert se.osa_distance("brown", "jumped", cutoff=7, cutoff_return=1000) == 6 + + +# test functionality +def test_in_index(): + """Do checks for checking if word is in the index.""" + df = data_for_test() + se = SearchEngine(df, identifier_name="id") + + # use string with space + with pytest.raises(Exception): + se.word_in_index("coal and space") + + assert se.word_in_index("coal") + assert not se.word_in_index("coa") + + +def test_spellcheck(): + """Do checks spell checking.""" + df = data_for_test() + se = SearchEngine(df, identifier_name="id") + + checked = se.spell_check("coa productions something flintstones") + # coal HAS to be first, it is found more often in the data + assert checked["coa"] == ["coal", "coat"] + # find production + assert checked["productions"] == ["production"] + # should be empty as there is no alternative (but this word occurs) + assert checked["something"] == [] + # should be empty as there is no alternative (does not exist) + assert checked["flintstones"] == [] + + def test_search_base(): - """Do checks for search ranking.""" + """Do checks for correct search ranking.""" df = data_for_test() @@ -147,35 +230,3 @@ def test_search_change_identifier(): } se.change_identifier(identifier="a", data=new_edit_data) assert se.search("coal production") == ["c", "b", "d", "h", "a", "f", "g"] - - -def test_string_distance(): - """Do tests specifically for string distance function""" - df = data_for_test() - se = SearchEngine(df, identifier_name="id") - - # same word - assert se.osa_distance("coal", "coal") == 0 - # empty string is length of other word - assert se.osa_distance("coal", "") == 4 - - # insert - assert se.osa_distance("coal", "coa") == 1 - # delete - assert se.osa_distance("coal", "coall") == 1 - # substitute - assert se.osa_distance("coal", "coat") == 1 - # transpose - assert se.osa_distance("coal", "cola") == 1 - - # longer edit distance - assert se.osa_distance("coal", "chocolate") == 6 - # reverse order gives same result - assert se.osa_distance("coal", "chocolate") == se.osa_distance("chocolate", "coal") - # cutoff - assert se.osa_distance("coal", "chocolate", cutoff=5, cutoff_return=1000) == 1000 - assert se.osa_distance("coal", "chocolate", cutoff=6, cutoff_return=1000) == 1000 - assert se.osa_distance("coal", "chocolate", cutoff=7, cutoff_return=1000) == 6 - # length cutoff - assert se.osa_distance("coal", "coallongword") == 8 - assert se.osa_distance("coal", "coallongword", cutoff=5, cutoff_return=1000) == 1000 From e2bb1cf6f732b469e3558fb484982ba961cbb874 Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Wed, 3 Sep 2025 11:37:31 +0200 Subject: [PATCH 10/47] update add/change identifier (and tests) to accept dataframes instead of dicts --- .../bwutils/search/searchengine.py | 91 +++++++++++-------- tests/test_search.py | 56 ++++++------ 2 files changed, 80 insertions(+), 67 deletions(-) diff --git a/activity_browser/bwutils/search/searchengine.py b/activity_browser/bwutils/search/searchengine.py index 913242a3a..aaf090eb6 100644 --- a/activity_browser/bwutils/search/searchengine.py +++ b/activity_browser/bwutils/search/searchengine.py @@ -78,7 +78,7 @@ def __init__(self, df: 
pd.DataFrame, identifier_name: str, searchable_columns: l self.identifier_column = self.columns.index(self.identifier_name) # store all searchable column indices except the identifier - self.regular_columns = [i for i in range(len(self.columns)) if i != self.identifier_column] + self.searchable_columns = [i for i in range(len(self.columns)) if i != self.identifier_column] # initialize search index dicts and update df self.identifier_to_word = {} @@ -112,6 +112,7 @@ def update_dict(update_me: dict, new: dict) -> dict: i2w, update_df = self.words_in_df(update_df) self.identifier_to_word = update_dict(self.identifier_to_word, i2w) self.df = pd.concat([self.df, update_df]) + self.df = self.df.fillna("") # ensure we don't add unwanted NA through concatenations # word to identifier w2i = self.reverse_dict_many_to_one(i2w) @@ -163,7 +164,7 @@ def words_in_df(self, df: pd.DataFrame = None) -> tuple[dict, pd.DataFrame]: df = df if any(df) else self.df return_df = df.copy() - df = df.iloc[:, self.regular_columns] + df = df.iloc[:, self.searchable_columns] identifier_word_dict = {} col = [] @@ -215,38 +216,47 @@ def word_in_index(self, word: str) -> bool: # +++ Changes to searchable data - def add_identifier(self, data: dict, make_searchable=[]) -> None: - """Add this identifier to the search index. + def add_identifier(self, data: pd.DataFrame) -> None: + """Add this data to the search index. - identifier is expected to be a unique identifier that has not been used before - data is expected to be a dict of column names and data + identifier column is REQUIRED to be present + ALL data in the given dataframe will be added, if columns should not be added, they should be removed before + calling this function """ - #TODO add ability to add new columns with make_searchable - identifier = data[self.identifier_name] - # make sure we the identifier does not yet exist - if identifier in self.df.index.to_list(): + # ensure we have identifier column + if self.identifier_name not in data.columns: raise Exception( - f"Identifier '{identifier}' is already in use, use a different identifier or use the change_identifier function.") - - df_cols = self.columns + f"Identifier column '{self.identifier_name}' not in new data, impossible to add data without identifier") - # drop fields that are not in self.df - drop = [col for col in data if col not in df_cols] - for field in drop: - del data[field] + # make sure we the identifier does not yet exist + existing_ids = set(self.df.index.to_list()) + for identifier in data[self.identifier_name]: + if identifier in existing_ids: + raise Exception( + f"Identifier '{identifier}' is already in use, use a different identifier or use the change_identifier function.") - # add empty field for missing data + df_cols = self.columns + # add cols to new data that are missing for col in df_cols: - if col not in data: - data[col] = "" - - # convert to df - new_df = pd.DataFrame(data, index=[identifier]) - new_df = new_df.astype(str) + if col not in data.columns: + data[col] = [""] * len(data) + # re-order cols, first existing, then new + new_cols = [col for col in data.columns if col not in self.columns if col not in set(df_cols)] + data_cols = df_cols + new_cols + data = data[data_cols] # re-order new data to be in correct order + + # add cols from new data to correct places + self.columns.extend(new_cols) + self.searchable_columns.extend([i for i, col in enumerate(data_cols) if col in new_cols]) + + # convert df + data = data.set_index(self.identifier_name, drop=False) + data = 
data.fillna("") + data = data.astype(str) # update the search index data - self.update_index(new_df) + self.update_index(data) def remove_identifier(self, identifier, logging=True) -> None: """Remove this identifier from self.df and the search index. @@ -288,37 +298,40 @@ def remove_identifier(self, identifier, logging=True) -> None: log.debug(f"Search index updated in {time() - t:.2f} seconds " f"for 1 removed item ({len(self.df)} items currently).") - def change_identifier(self, identifier, data: dict) -> None: + def change_identifier(self, identifier, data: pd.DataFrame) -> None: """Change this identifier. - identifier is expected to be a unique identifier that is in use - data is expected to be a dict of column names and data that change - - only changed data needs to be supplied + identifier must be an identifier that is in use + data must be a dataframe of 1 row with all change data + data is overwritten with the new data in 'data', columns not given remain unchanged """ - # make sure the identifier exists + + # make sure only 1 change item is given + if len(data) > 1 or len(data) < 1: + raise Exception( + f"change data must be for exactly 1 identifier, but {len(data)} items were given.") + # make sure correct use of identifier if identifier not in self.df.index.to_list(): raise Exception( f"Identifier '{identifier}' does not exist in the search data, use an existing identifier or use the add_identifier function.") - if self.identifier_name in data.keys() and data[self.identifier_name] != identifier: + if self.identifier_name in data.columns and data[self.identifier_name].to_list() != [identifier]: raise Exception( "Identifier field cannot be changed, first remove item and then add new identifier") if "query_col" in data.keys(): log.debug( f"Field 'query_col' is a protected field for search engine and will be ignored for changing {identifier}") - # get existing data - update_data = {col: self.df.loc[identifier, col] for col in self.df.columns} - del update_data["query_col"] # overwrite new data where relevant - for field, value in data.items(): - update_data[field] = value + update_data = self.df.loc[[identifier], self.columns] + data = data.reset_index(drop=True) + for col in data.columns: + value = data.loc[0, col] + update_data[col] = [value] # remove the entry self.remove_identifier(identifier, logging=False) - - # add entry with new data + # add entry with updated data self.add_identifier(update_data) # +++ Search diff --git a/tests/test_search.py b/tests/test_search.py index 036e6c864..620c5c3b8 100644 --- a/tests/test_search.py +++ b/tests/test_search.py @@ -138,42 +138,43 @@ def test_search_add_identifier(): df = data_for_test() # create base item to add - new_base_item = { - "id": "i", - "col1": "coal production", - "col2": "coal production" - } + new_base_item = pd.DataFrame([ + ["i", "coal production", "coal production"], + ], + columns=["id", "col1", "col2"]) - # use mismatched identifier and fail + # use existing identifier and fail se = SearchEngine(df, identifier_name="id") + wrong_id = new_base_item.copy() + wrong_id.iloc[0, 0] = "a" with pytest.raises(Exception): - se.add_identifier(identifier="j", data=new_base_item) + se.add_identifier(wrong_id) - # use existing identifier and fail + # add data without identifier column se = SearchEngine(df, identifier_name="id") - wrong_id = new_base_item.copy() - wrong_id["id"] = "a" + no_id = new_base_item.copy() + del no_id["id"] with pytest.raises(Exception): - se.add_identifier(identifier="a", data=wrong_id) + 
se.add_identifier(no_id) - # use column too many (should be removed) + # use column more (and find data in new col) se = SearchEngine(df, identifier_name="id") col_more = new_base_item.copy() - col_more["col3"] = "word" - se.add_identifier(identifier="i", data=col_more) - assert "col3" not in se.df.columns + col_more["col3"] = ["potatoes"] + se.add_identifier(col_more) + assert se.search("potatoes") == ["i"] # use column less (should be filled with empty string) se = SearchEngine(df, identifier_name="id") col_less = new_base_item.copy() del col_less["col2"] - se.add_identifier(identifier="i", data=col_less) + se.add_identifier(col_less) assert se.df.loc["i", "col2"] == "" # do search, add item and verify results are different se = SearchEngine(df, identifier_name="id") assert se.search("coal production") == ["a", "c", "b", "d", "h", "f", "g"] - se.add_identifier(identifier="i", data=new_base_item) + se.add_identifier(new_base_item) assert se.search("coal production") == ["i", "a", "c", "b", "d", "h", "f", "g"] @@ -198,23 +199,22 @@ def test_search_change_identifier(): df = data_for_test() # create base item to add - edit_data = { - "id": "a", - "col1": "cant find me anymore", - "col2": "something different" - } + edit_data = pd.DataFrame([ + ["a", "cant find me anymore", "something different"], + ], + columns=["id", "col1", "col2"]) # use non-existent identifier and fail se = SearchEngine(df, identifier_name="id") missing_id = edit_data.copy() - missing_id["id"] = "i" + missing_id["id"] = ["i"] with pytest.raises(Exception): se.change_identifier(identifier="i", data=missing_id) # use mismatched identifier and fail se = SearchEngine(df, identifier_name="id") wrong_id = edit_data.copy() - wrong_id["id"] = "i" + wrong_id["id"] = ["i"] with pytest.raises(Exception): se.change_identifier(identifier="a", data=wrong_id) @@ -224,9 +224,9 @@ def test_search_change_identifier(): se.change_identifier(identifier="a", data=edit_data) assert se.search("coal production") == ["c", "b", "d", "h", "f", "g"] # now change the same item partially and verify results are different - new_edit_data = { - "id": "a", - "col1": "coal" - } + new_edit_data = pd.DataFrame([ + ["a", "coal"], + ], + columns=["id", "col1"]) se.change_identifier(identifier="a", data=new_edit_data) assert se.search("coal production") == ["c", "b", "d", "h", "a", "f", "g"] From 2d6ca0f94659967808cafa0c062a881875b99ab2 Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Wed, 3 Sep 2025 11:38:33 +0200 Subject: [PATCH 11/47] update add/change identifier (and tests) to accept dataframes instead of dicts --- tests/test_search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_search.py b/tests/test_search.py index 620c5c3b8..52199a64e 100644 --- a/tests/test_search.py +++ b/tests/test_search.py @@ -44,7 +44,7 @@ def test_reverse_dict(): w2i = se.reverse_dict_many_to_one(se.identifier_to_word) assert w2i == se.word_to_identifier - # reverse same and verify is same as input + # reverse again and verify is same as original i2w = se.reverse_dict_many_to_one(w2i) assert i2w == se.identifier_to_word From 04053ab82de07d96d49f15a1407015d7b5a43ecd Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Wed, 3 Sep 2025 11:40:20 +0200 Subject: [PATCH 12/47] move searchengine.py to bwutils instead of subfolder --- activity_browser/bwutils/search/__init__.py | 1 - activity_browser/bwutils/{search => }/searchengine.py | 0 2 files changed, 1 deletion(-) delete mode 100644 activity_browser/bwutils/search/__init__.py rename 
activity_browser/bwutils/{search => }/searchengine.py (100%) diff --git a/activity_browser/bwutils/search/__init__.py b/activity_browser/bwutils/search/__init__.py deleted file mode 100644 index 85e30c9be..000000000 --- a/activity_browser/bwutils/search/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .searchengine import SearchEngine, MetaDataSearchEngine \ No newline at end of file diff --git a/activity_browser/bwutils/search/searchengine.py b/activity_browser/bwutils/searchengine.py similarity index 100% rename from activity_browser/bwutils/search/searchengine.py rename to activity_browser/bwutils/searchengine.py From 478ed5dfb0680abd7f498517aab512dfab9a06d1 Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Wed, 3 Sep 2025 11:41:08 +0200 Subject: [PATCH 13/47] move searchengine.py to bwutils instead of subfolder --- activity_browser/bwutils/__init__.py | 2 +- tests/test_search.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/activity_browser/bwutils/__init__.py b/activity_browser/bwutils/__init__.py index 5e97b3a2f..18839d5ac 100644 --- a/activity_browser/bwutils/__init__.py +++ b/activity_browser/bwutils/__init__.py @@ -13,7 +13,7 @@ from .montecarlo import MonteCarloLCA from .multilca import MLCA, Contributions from .pedigree import PedigreeMatrix -from .search import SearchEngine, MetaDataSearchEngine +from .searchengine import SearchEngine, MetaDataSearchEngine from .sensitivity_analysis import GlobalSensitivityAnalysis from .superstructure import SuperstructureContributions, SuperstructureMLCA from .uncertainty import (CFUncertaintyInterface, ExchangeUncertaintyInterface, diff --git a/tests/test_search.py b/tests/test_search.py index 52199a64e..2bb038124 100644 --- a/tests/test_search.py +++ b/tests/test_search.py @@ -1,6 +1,6 @@ import pytest import pandas as pd -from activity_browser.bwutils.search import SearchEngine +from activity_browser.bwutils import SearchEngine def data_for_test(): From 1c1300728d439d2808251a382929430b93d6c40e Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Wed, 3 Sep 2025 12:08:54 +0200 Subject: [PATCH 14/47] move searchengine files --- .../bwutils/searchengine/__init__.py | 2 + .../{searchengine.py => searchengine/base.py} | 185 ----------------- .../bwutils/searchengine/metadata_search.py | 196 ++++++++++++++++++ 3 files changed, 198 insertions(+), 185 deletions(-) create mode 100644 activity_browser/bwutils/searchengine/__init__.py rename activity_browser/bwutils/{searchengine.py => searchengine/base.py} (78%) create mode 100644 activity_browser/bwutils/searchengine/metadata_search.py diff --git a/activity_browser/bwutils/searchengine/__init__.py b/activity_browser/bwutils/searchengine/__init__.py new file mode 100644 index 000000000..7a7eae9c1 --- /dev/null +++ b/activity_browser/bwutils/searchengine/__init__.py @@ -0,0 +1,2 @@ +from base import SearchEngine +from metadata_search import MetaDataSearchEngine diff --git a/activity_browser/bwutils/searchengine.py b/activity_browser/bwutils/searchengine/base.py similarity index 78% rename from activity_browser/bwutils/searchengine.py rename to activity_browser/bwutils/searchengine/base.py index aaf090eb6..7f9d8158e 100644 --- a/activity_browser/bwutils/searchengine.py +++ b/activity_browser/bwutils/searchengine/base.py @@ -746,188 +746,3 @@ def search(self, text) -> list: log.debug( f"Found {len(identifiers)} ({len(literal_identifiers)} literal) search results for '{text}' in {len(self.df)} items in {time() - t:.2f} seconds") return identifiers - - -class MetaDataSearchEngine(SearchEngine): 
- def find_q_gram_matches(self, q_grams: set) -> pd.DataFrame: - """Overwritten for extra database specific reduction of results. - """ - n_q_grams = len(q_grams) - - matches = {} - - # find words that match our q-grams - for q_gram in q_grams: - if words := self.q_gram_to_word.get(q_gram, False): - # q_gram exists in our search index - for word in words: - if isinstance(self.database_ids, set): - # DATABASE SPECIFIC now filter on whether word is in the database - in_db = False - for _id in self.word_to_identifier[word]: - if _id in self.database_ids: - in_db = True - break - else: - in_db = True - if in_db: - matches[word] = matches.get(word, 0) + words[word] - - # if we find no results, return an empty dataframe - if len(matches) == 0: - return pd.DataFrame({"word": [], "matches": []}) - - # otherwise, create a dataframe and - # reduce search results to most relevant results - matches = {"word": matches.keys(), "matches": matches.values()} - matches = pd.DataFrame(matches) - max_q = max(matches["matches"]) # this has the most matching q-grams - - # determine how many results we want to keep based on how good our results are - min_q = max(max_q * 0.32, # have at least a third of q-grams of best match or... - max(n_q_grams * 0.5, # if more, at least half the q-grams in the query word? - 1)) # okay just do 1 q-gram if there are no more in the word - - matches = matches[matches["matches"] >= min_q] - matches = matches.sort_values(by="matches", ascending=False) - matches = matches.reset_index(drop=True) - - return matches.iloc[:min(len(matches), 2500), :] # return at most this many results - - def fuzzy_search(self, text: str) -> list: - """Overwritten for extra database specific reduction of results. - """ - queries = self.build_queries(text) - - # make list of unique original words - orig_words = OrderedDict() - for word in text.split(" "): - orig_words[word] = False - orig_words = orig_words.keys() - orig_words = {self.clean_text(word) for word in orig_words} - - # order the queries by the amount of words they contain - # we do this because longer queries (more words) are harder to find, but we have many alternatives so we search in a smaller search space - queries_by_size = OrderedDict() - longest_query = max([len(q) for q in queries]) - for query_len in range(1, longest_query + 1): - queries_by_size[query_len] = [q for q in queries if len(q) == query_len] - - # first handle queries of length 1 - query_to_identifier = self.search_size_1(queries_by_size[1], orig_words) - - # DATABASE SPECIFIC ensure all identifiers are in the database - if isinstance(self.database_ids, set): - new_q2i = {} - for word, _ids in query_to_identifier.items(): - keep = set.intersection(set(_ids.keys()), self.database_ids) - new_id_counter = Counter() - for _id in keep: - new_id_counter[_id] = _ids[_id] - if len(new_id_counter) > 0: - new_q2i[word] = new_id_counter - query_to_identifier = new_q2i - - # get all results into a df, we rank further later - all_identifiers = set() - for id_list in [id_list for id_list in query_to_identifier.values()]: - all_identifiers.update(id_list) - search_df = self.df.loc[list(all_identifiers)] - - # now, we search for combinations of query words and get only those identifiers - # we then reduce de search_df further for only those matching identifiers - # we then search the permutations of that set of words - for q_len, query_set in queries_by_size.items(): - if q_len == 1: - # we already did these above - continue - for query in query_set: - - # get the intersection of all 
identifiers - # meaning, a set of identifiers that occur in ALL sets of len(1) for the individual words in the query - # this ensures we only ever search data where ALL items occur to substantially reduce search-space - # finally, make this a Counter (with each item=1) so we can properly weigh things later - query_identifier_set = set.intersection(*[set(query_to_identifier.get(q_word)) for q_word in query if - query_to_identifier.get(q_word, False)]) - if len(query_identifier_set) == 0: - # there is no match for this combination of query words, skip - break - - # now we convert the query identifiers to a Counter of 'occurrence', - # where we weigh queries with only original words higher - query_identifiers = Counter() - for identifier in query_identifier_set: - weight = 0 - for query_word in query: - weight += query_to_identifier[query_word][identifier] - - query_identifiers[identifier] = weight - - # we now add these identifiers to a counter for this query name, - query_name = " ".join(query) - - weight = self.base_weight * q_len - query_to_identifier[query_name] = self.weigh_identifiers(query_identifiers, weight, Counter()) - - # now search for all permutations of this query combined with a space - query_df = search_df[search_df[self.identifier_name].isin(query_identifiers)] - for query_perm in permutations(query): - mask = self.filter_dataframe(query_df, " ".join(query_perm), search_columns=["query_col"]) - new_df = query_df.loc[mask].reset_index(drop=True) - if len(new_df) == 0: - # there is no match for this permutation of words, skip - continue - new_id_list = new_df[self.identifier_name] - - new_ids = Counter() - for new_id in new_id_list: - new_ids[new_id] = query_identifiers[new_id] - - # we weigh a combination of words that is next also to each other even higher than just the words separately - query_to_identifier[query_name] = self.weigh_identifiers(new_ids, weight, - query_to_identifier[query_name]) - # now finally, move to one object sorted list by highest score - all_identifiers = Counter() - for identifiers in query_to_identifier.values(): - all_identifiers += identifiers - - # now sort on highest weights and make list type - sorted_identifiers = [identifier[0] for identifier in all_identifiers.most_common()] - return sorted_identifiers - - def search(self, text, database: Optional[str] = None) -> list: - """Search the dataframe on this text, return a sorted list of identifiers.""" - t = time() - - if len(text) == 0: - log.debug(f"Empty search, returned all items") - return self.df.index.to_list() - - # get the set of ids that is in this database - if database is not None: - self.database_ids = set(self.df[self.df["database"] == database].index.to_list()) - else: - self.database_ids = None - - fuzzy_identifiers = self.fuzzy_search(text) - if len(fuzzy_identifiers) == 0: - log.debug(f"Found 0 search results for '{text}' in {len(self.df)} items in {time() - t:.2f} seconds") - return [] - - # take the fuzzy search sub-set of data and search it literally - df = self.df.loc[fuzzy_identifiers].copy() - - literal_identifiers = self.literal_search(text, df) - if len(literal_identifiers) == 0: - log.debug( - f"Found {len(fuzzy_identifiers)} search results for '{text}' in {len(self.df)} items in {time() - t:.2f} seconds") - return fuzzy_identifiers - - # append any fuzzy identifiers that were not found in the literal search - remaining_fuzzy_identifiers = [ - _id for _id in fuzzy_identifiers if _id not in set(literal_identifiers)] - identifiers = literal_identifiers + 
remaining_fuzzy_identifiers - - log.debug( - f"Found {len(identifiers)} ({len(literal_identifiers)} literal) search results for '{text}' in {len(self.df)} items in {time() - t:.2f} seconds") - return identifiers diff --git a/activity_browser/bwutils/searchengine/metadata_search.py b/activity_browser/bwutils/searchengine/metadata_search.py new file mode 100644 index 000000000..01a5f93aa --- /dev/null +++ b/activity_browser/bwutils/searchengine/metadata_search.py @@ -0,0 +1,196 @@ +from itertools import permutations +from collections import Counter, OrderedDict +from logging import getLogger +from time import time +from typing import Optional +import pandas as pd + +from activity_browser.bwutils.searchengine import SearchEngine + + +log = getLogger(__name__) + + +class MetaDataSearchEngine(SearchEngine): + def find_q_gram_matches(self, q_grams: set) -> pd.DataFrame: + """Overwritten for extra database specific reduction of results. + """ + n_q_grams = len(q_grams) + + matches = {} + + # find words that match our q-grams + for q_gram in q_grams: + if words := self.q_gram_to_word.get(q_gram, False): + # q_gram exists in our search index + for word in words: + if isinstance(self.database_ids, set): + # DATABASE SPECIFIC now filter on whether word is in the database + in_db = False + for _id in self.word_to_identifier[word]: + if _id in self.database_ids: + in_db = True + break + else: + in_db = True + if in_db: + matches[word] = matches.get(word, 0) + words[word] + + # if we find no results, return an empty dataframe + if len(matches) == 0: + return pd.DataFrame({"word": [], "matches": []}) + + # otherwise, create a dataframe and + # reduce search results to most relevant results + matches = {"word": matches.keys(), "matches": matches.values()} + matches = pd.DataFrame(matches) + max_q = max(matches["matches"]) # this has the most matching q-grams + + # determine how many results we want to keep based on how good our results are + min_q = max(max_q * 0.32, # have at least a third of q-grams of best match or... + max(n_q_grams * 0.5, # if more, at least half the q-grams in the query word? + 1)) # okay just do 1 q-gram if there are no more in the word + + matches = matches[matches["matches"] >= min_q] + matches = matches.sort_values(by="matches", ascending=False) + matches = matches.reset_index(drop=True) + + return matches.iloc[:min(len(matches), 2500), :] # return at most this many results + + def fuzzy_search(self, text: str) -> list: + """Overwritten for extra database specific reduction of results. 
+ """ + queries = self.build_queries(text) + + # make list of unique original words + orig_words = OrderedDict() + for word in text.split(" "): + orig_words[word] = False + orig_words = orig_words.keys() + orig_words = {self.clean_text(word) for word in orig_words} + + # order the queries by the amount of words they contain + # we do this because longer queries (more words) are harder to find, but we have many alternatives so we search in a smaller search space + queries_by_size = OrderedDict() + longest_query = max([len(q) for q in queries]) + for query_len in range(1, longest_query + 1): + queries_by_size[query_len] = [q for q in queries if len(q) == query_len] + + # first handle queries of length 1 + query_to_identifier = self.search_size_1(queries_by_size[1], orig_words) + + # DATABASE SPECIFIC ensure all identifiers are in the database + if isinstance(self.database_ids, set): + new_q2i = {} + for word, _ids in query_to_identifier.items(): + keep = set.intersection(set(_ids.keys()), self.database_ids) + new_id_counter = Counter() + for _id in keep: + new_id_counter[_id] = _ids[_id] + if len(new_id_counter) > 0: + new_q2i[word] = new_id_counter + query_to_identifier = new_q2i + + # get all results into a df, we rank further later + all_identifiers = set() + for id_list in [id_list for id_list in query_to_identifier.values()]: + all_identifiers.update(id_list) + search_df = self.df.loc[list(all_identifiers)] + + # now, we search for combinations of query words and get only those identifiers + # we then reduce de search_df further for only those matching identifiers + # we then search the permutations of that set of words + for q_len, query_set in queries_by_size.items(): + if q_len == 1: + # we already did these above + continue + for query in query_set: + + # get the intersection of all identifiers + # meaning, a set of identifiers that occur in ALL sets of len(1) for the individual words in the query + # this ensures we only ever search data where ALL items occur to substantially reduce search-space + # finally, make this a Counter (with each item=1) so we can properly weigh things later + query_identifier_set = set.intersection(*[set(query_to_identifier.get(q_word)) for q_word in query if + query_to_identifier.get(q_word, False)]) + if len(query_identifier_set) == 0: + # there is no match for this combination of query words, skip + break + + # now we convert the query identifiers to a Counter of 'occurrence', + # where we weigh queries with only original words higher + query_identifiers = Counter() + for identifier in query_identifier_set: + weight = 0 + for query_word in query: + weight += query_to_identifier[query_word][identifier] + + query_identifiers[identifier] = weight + + # we now add these identifiers to a counter for this query name, + query_name = " ".join(query) + + weight = self.base_weight * q_len + query_to_identifier[query_name] = self.weigh_identifiers(query_identifiers, weight, Counter()) + + # now search for all permutations of this query combined with a space + query_df = search_df[search_df[self.identifier_name].isin(query_identifiers)] + for query_perm in permutations(query): + mask = self.filter_dataframe(query_df, " ".join(query_perm), search_columns=["query_col"]) + new_df = query_df.loc[mask].reset_index(drop=True) + if len(new_df) == 0: + # there is no match for this permutation of words, skip + continue + new_id_list = new_df[self.identifier_name] + + new_ids = Counter() + for new_id in new_id_list: + new_ids[new_id] = query_identifiers[new_id] + + # we 
weigh a combination of words that is next also to each other even higher than just the words separately + query_to_identifier[query_name] = self.weigh_identifiers(new_ids, weight, + query_to_identifier[query_name]) + # now finally, move to one object sorted list by highest score + all_identifiers = Counter() + for identifiers in query_to_identifier.values(): + all_identifiers += identifiers + + # now sort on highest weights and make list type + sorted_identifiers = [identifier[0] for identifier in all_identifiers.most_common()] + return sorted_identifiers + + def search(self, text, database: Optional[str] = None) -> list: + """Search the dataframe on this text, return a sorted list of identifiers.""" + t = time() + + if len(text) == 0: + log.debug(f"Empty search, returned all items") + return self.df.index.to_list() + + # get the set of ids that is in this database + if database is not None: + self.database_ids = set(self.df[self.df["database"] == database].index.to_list()) + else: + self.database_ids = None + + fuzzy_identifiers = self.fuzzy_search(text) + if len(fuzzy_identifiers) == 0: + log.debug(f"Found 0 search results for '{text}' in {len(self.df)} items in {time() - t:.2f} seconds") + return [] + + # take the fuzzy search sub-set of data and search it literally + df = self.df.loc[fuzzy_identifiers].copy() + + literal_identifiers = self.literal_search(text, df) + if len(literal_identifiers) == 0: + log.debug( + f"Found {len(fuzzy_identifiers)} search results for '{text}' in {len(self.df)} items in {time() - t:.2f} seconds") + return fuzzy_identifiers + + # append any fuzzy identifiers that were not found in the literal search + remaining_fuzzy_identifiers = [ + _id for _id in fuzzy_identifiers if _id not in set(literal_identifiers)] + identifiers = literal_identifiers + remaining_fuzzy_identifiers + + log.debug( + f"Found {len(identifiers)} ({len(literal_identifiers)} literal) search results for '{text}' in {len(self.df)} items in {time() - t:.2f} seconds") + return identifiers From 646b3beeba41ab5b4a43a77fc55e58696847ba9e Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Wed, 3 Sep 2025 12:10:20 +0200 Subject: [PATCH 15/47] move searchengine files --- activity_browser/bwutils/searchengine/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/activity_browser/bwutils/searchengine/__init__.py b/activity_browser/bwutils/searchengine/__init__.py index 7a7eae9c1..a3ed1d8e1 100644 --- a/activity_browser/bwutils/searchengine/__init__.py +++ b/activity_browser/bwutils/searchengine/__init__.py @@ -1,2 +1,2 @@ -from base import SearchEngine -from metadata_search import MetaDataSearchEngine +from .base import SearchEngine +from .metadata_search import MetaDataSearchEngine From 39af7634a4ad43cd6aceec76f638989f7198dc4c Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Wed, 3 Sep 2025 13:27:26 +0200 Subject: [PATCH 16/47] metadata and search size logging --- activity_browser/bwutils/metadata.py | 15 +++++++++--- activity_browser/bwutils/searchengine/base.py | 23 +++++++++++++++---- 2 files changed, 31 insertions(+), 7 deletions(-) diff --git a/activity_browser/bwutils/metadata.py b/activity_browser/bwutils/metadata.py index 2e665cdd7..b3d6a7967 100644 --- a/activity_browser/bwutils/metadata.py +++ b/activity_browser/bwutils/metadata.py @@ -2,11 +2,13 @@ import itertools import sqlite3 import pickle +import sys from time import time from functools import lru_cache from typing import Set from logging import getLogger +from playhouse.shortcuts import model_to_dict import pandas 
as pd from qtpy.QtCore import Qt, QObject, Signal, SignalInstance @@ -15,7 +17,7 @@ from bw2data.errors import UnknownObject from bw2data.backends import sqlite3_lci_db, ActivityDataset -from activity_browser.bwutils.search import MetaDataSearchEngine +from activity_browser.bwutils.searchengine import MetaDataSearchEngine from activity_browser import signals @@ -183,6 +185,7 @@ def _get_database(self, db_name: str) -> pd.DataFrame | None: def sync(self) -> None: """Deletes metadata when the project is changed.""" + t = time() log.debug("Synchronizing MetaDataStore") con = sqlite3.connect(sqlite3_lci_db._filepath) @@ -190,8 +193,14 @@ def sync(self) -> None: con.close() self.dataframe = self._parse_df(node_df) - self.init_search() # init search index + size_bytes = sys.getsizeof(self.dataframe) + if size_bytes < 1024 ** 3: + size = f"{size_bytes / (1024 ** 2):.1f} MB" + else: + size = f"{size_bytes / (1024 ** 3):.2f} GB" + log.debug(f"MetaDataStore Synchronized in {time() - t:.2f} seconds for {len(self.dataframe)} items ({size}))") + self.init_search() # init search index self.synced.emit() def _parse_df(self, raw_df: pd.DataFrame) -> pd.DataFrame: @@ -351,7 +360,7 @@ def init_search(self): "product", "reference product", "classifications", "location", "properties" # activity specific ] - MetaDataSearchEngine(self.dataframe, identifier_name="id", searchable_columns=allowed_cols) + self.search_engine = MetaDataSearchEngine(self.dataframe, identifier_name="id", searchable_columns=allowed_cols) AB_metadata = MetaDataStore() diff --git a/activity_browser/bwutils/searchengine/base.py b/activity_browser/bwutils/searchengine/base.py index 7f9d8158e..293ddd230 100644 --- a/activity_browser/bwutils/searchengine/base.py +++ b/activity_browser/bwutils/searchengine/base.py @@ -8,6 +8,7 @@ import pandas as pd import numpy as np import re +import sys log = getLogger(__name__) @@ -128,9 +129,9 @@ def update_dict(update_me: dict, new: dict) -> dict: size_new = len(self.df) size_dif = size_new - size_old - size_msg = (f"{size_dif} changed items at {round(size_dif/(time() - t), 0)} items/sec " - f"({size_new} items currently)") if size_dif > 1 \ - else f"1 changed item ({size_new} items currently)" + size_msg = (f"{size_dif} changed items at {int(round(size_dif/(time() - t), 0))} items/sec " + f"({size_new} items ({self.size_of_index()}) currently)") if size_dif > 1 \ + else f"1 changed item ({size_new} items ({self.size_of_index()}) currently)" log.debug(f"Search index updated in {time() - t:.2f} seconds for {size_msg}.") def clean_text(self, text: str): @@ -214,6 +215,20 @@ def word_in_index(self, word: str) -> bool: f"Given word '{word}' must not contain spaces.") return word in self.word_to_identifier.keys() + def size_of_index(self): + """return the size of the search index in MB or GB.""" + s_df = sys.getsizeof(self.df) + s_i2w = sys.getsizeof(self.identifier_to_word) + s_w2i = sys.getsizeof(self.word_to_identifier) + s_w2q = sys.getsizeof(self.word_to_q_grams) + s_q2w = sys.getsizeof(self.q_gram_to_word) + size_bytes = s_df + s_i2w + s_w2i + s_w2q + s_q2w + + if size_bytes < 1024 ** 3: + return f"{size_bytes / (1024 ** 2):.1f} MB" + else: + return f"{size_bytes / (1024 ** 3):.2f} GB" + # +++ Changes to searchable data def add_identifier(self, data: pd.DataFrame) -> None: @@ -296,7 +311,7 @@ def remove_identifier(self, identifier, logging=True) -> None: if logging: log.debug(f"Search index updated in {time() - t:.2f} seconds " - f"for 1 removed item ({len(self.df)} items currently).") + f"for 1 removed 
item ({len(self.df)} items ({self.size_of_index()}) currently).") def change_identifier(self, identifier, data: pd.DataFrame) -> None: """Change this identifier. From fad8a06f6d5c3af2668fd7e2a35cf408c98710bf Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Wed, 3 Sep 2025 16:27:11 +0200 Subject: [PATCH 17/47] - Faster results with large data and short queries - solve bracket in wrong place breaking matchfinding --- activity_browser/bwutils/searchengine/base.py | 10 ++++++---- .../bwutils/searchengine/metadata_search.py | 3 ++- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/activity_browser/bwutils/searchengine/base.py b/activity_browser/bwutils/searchengine/base.py index 293ddd230..618585991 100644 --- a/activity_browser/bwutils/searchengine/base.py +++ b/activity_browser/bwutils/searchengine/base.py @@ -257,7 +257,8 @@ def add_identifier(self, data: pd.DataFrame) -> None: if col not in data.columns: data[col] = [""] * len(data) # re-order cols, first existing, then new - new_cols = [col for col in data.columns if col not in self.columns if col not in set(df_cols)] + df_col_set = set(df_cols) + new_cols = [col for col in data.columns if col not in self.columns if col not in df_col_set] data_cols = df_cols + new_cols data = data[data_cols] # re-order new data to be in correct order @@ -285,7 +286,7 @@ def remove_identifier(self, identifier, logging=True) -> None: f"Identifier '{identifier}' does not exist in the search data, cannot remove identifier that do not exist.") # remove from df - self.df.drop(identifier, inplace=True) + self.df = self.df.drop(identifier) # find words that may need to be removed words = self.identifier_to_word[identifier] @@ -547,7 +548,7 @@ def spell_check(self, text: str) -> OrderedDict: for match, num in new.most_common(): if num == prev_num: matches.append(match) - elif num != prev_num and len(matches <= matches_max): + elif num != prev_num and len(matches) <= matches_max: matches.append(match) else: break @@ -754,8 +755,9 @@ def search(self, text) -> list: return fuzzy_identifiers # append any fuzzy identifiers that were not found in the literal search + literal_id_set = set(literal_identifiers) remaining_fuzzy_identifiers = [ - _id for _id in fuzzy_identifiers if _id not in set(literal_identifiers)] + _id for _id in fuzzy_identifiers if _id not in literal_id_set] identifiers = literal_identifiers + remaining_fuzzy_identifiers log.debug( diff --git a/activity_browser/bwutils/searchengine/metadata_search.py b/activity_browser/bwutils/searchengine/metadata_search.py index 01a5f93aa..43332e939 100644 --- a/activity_browser/bwutils/searchengine/metadata_search.py +++ b/activity_browser/bwutils/searchengine/metadata_search.py @@ -187,8 +187,9 @@ def search(self, text, database: Optional[str] = None) -> list: return fuzzy_identifiers # append any fuzzy identifiers that were not found in the literal search + literal_id_set = set(literal_identifiers) remaining_fuzzy_identifiers = [ - _id for _id in fuzzy_identifiers if _id not in set(literal_identifiers)] + _id for _id in fuzzy_identifiers if _id not in literal_id_set] identifiers = literal_identifiers + remaining_fuzzy_identifiers log.debug( From 1aad95b421d15c8ec8e9bd406a5cea47c802689c Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Wed, 3 Sep 2025 18:12:53 +0200 Subject: [PATCH 18/47] Base implementation of better search in ActivitiesProducts table --- activity_browser/bwutils/metadata.py | 7 ++- activity_browser/bwutils/searchengine/base.py | 4 +- .../bwutils/searchengine/metadata_search.py | 20 ++++++- 
.../layouts/panes/database_products.py | 56 ++++++++++++++++++- activity_browser/ui/widgets/item_model.py | 19 +++++-- activity_browser/ui/widgets/treeview.py | 5 +- 6 files changed, 100 insertions(+), 11 deletions(-) diff --git a/activity_browser/bwutils/metadata.py b/activity_browser/bwutils/metadata.py index b3d6a7967..7e1ff5e1f 100644 --- a/activity_browser/bwutils/metadata.py +++ b/activity_browser/bwutils/metadata.py @@ -5,7 +5,7 @@ import sys from time import time from functools import lru_cache -from typing import Set +from typing import Set, Optional from logging import getLogger from playhouse.shortcuts import model_to_dict @@ -362,5 +362,10 @@ def init_search(self): self.search_engine = MetaDataSearchEngine(self.dataframe, identifier_name="id", searchable_columns=allowed_cols) + def db_search(self, query:str, database: Optional[str] = None, return_counter: bool = False): + return self.search_engine.fuzzy_search(query, database=database, return_counter=return_counter) + + def search(self, query:str): + return self.search_engine.search(query) AB_metadata = MetaDataStore() diff --git a/activity_browser/bwutils/searchengine/base.py b/activity_browser/bwutils/searchengine/base.py index 618585991..f01b941a7 100644 --- a/activity_browser/bwutils/searchengine/base.py +++ b/activity_browser/bwutils/searchengine/base.py @@ -628,7 +628,7 @@ def search_size_1(self, queries: list, original_words: set, orig_word_weight=5, return matched_identifiers - def fuzzy_search(self, text: str) -> list: + def fuzzy_search(self, text: str, return_counter: bool = False) -> list: """Search the dataframe, finding approximate matches and return a list of identifiers, ranked by how well each identifier matches the search text. @@ -728,6 +728,8 @@ def fuzzy_search(self, text: str) -> list: for identifiers in query_to_identifier.values(): all_identifiers += identifiers + if return_counter: + return all_identifiers # now sort on highest weights and make list type sorted_identifiers = [identifier[0] for identifier in all_identifiers.most_common()] return sorted_identifiers diff --git a/activity_browser/bwutils/searchengine/metadata_search.py b/activity_browser/bwutils/searchengine/metadata_search.py index 43332e939..624e2365b 100644 --- a/activity_browser/bwutils/searchengine/metadata_search.py +++ b/activity_browser/bwutils/searchengine/metadata_search.py @@ -57,9 +57,20 @@ def find_q_gram_matches(self, q_grams: set) -> pd.DataFrame: return matches.iloc[:min(len(matches), 2500), :] # return at most this many results - def fuzzy_search(self, text: str) -> list: + def fuzzy_search(self, text: str, database: Optional[str] = None, return_counter: bool = False, logging: bool = True) -> list: """Overwritten for extra database specific reduction of results. 
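# --- Editor's note: the diff above routes fuzzy searches through the q-gram
# --- pre-filtering of find_q_gram_matches before the more expensive
# --- edit-distance step. A minimal, self-contained sketch of that idea (not
# --- part of the patch; `index_words` and `candidate_words` are hypothetical
# --- names, the real code works on the q_gram_to_word dict and a pandas
# --- DataFrame):
from collections import Counter

def q_grams(word: str, q: int = 2) -> set:
    # "cola" -> {"co", "ol", "la"} (the real class also records positions)
    return {word[i:i + q] for i in range(len(word) - q + 1)}

def candidate_words(query_word: str, index_words: list) -> list:
    grams = q_grams(query_word)
    overlap = Counter({w: len(grams & q_grams(w)) for w in index_words})
    overlap = Counter({w: n for w, n in overlap.items() if n > 0})
    if not overlap:
        return []
    max_q = max(overlap.values())
    # same shape as the min_q threshold tuned later in this series: roughly a
    # third of the best match, at least half the query's own q-grams, never
    # more than max_q
    min_q = min(max(max_q * 0.32, max(len(grams) * 0.5, 1)), max_q)
    return [w for w, n in overlap.most_common() if n >= min_q]

# candidate_words("cola", ["coal", "coat", "market"]) keeps "coal" and "coat"
# as fuzzy candidates for the typo "cola" and drops "market" entirely.
# --- end of editor's note ---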
""" + if len(text) == 0: + log.debug(f"Empty search, returned all items") + return self.df.index.to_list() + t = time() + + # DATABASE SPECIFIC get the set of ids that is in this database + if database is not None: + self.database_ids = set(self.df[self.df["database"] == database].index.to_list()) + else: + self.database_ids = None + queries = self.build_queries(text) # make list of unique original words @@ -154,6 +165,11 @@ def fuzzy_search(self, text: str) -> list: for identifiers in query_to_identifier.values(): all_identifiers += identifiers + if logging: + log.debug( + f"Found {len(all_identifiers)} search results for '{text}' in {len(self.df)} items in {time() - t:.2f} seconds") + if return_counter: + return all_identifiers # now sort on highest weights and make list type sorted_identifiers = [identifier[0] for identifier in all_identifiers.most_common()] return sorted_identifiers @@ -172,7 +188,7 @@ def search(self, text, database: Optional[str] = None) -> list: else: self.database_ids = None - fuzzy_identifiers = self.fuzzy_search(text) + fuzzy_identifiers = self.fuzzy_search(text, database=database, logging=False) if len(fuzzy_identifiers) == 0: log.debug(f"Found 0 search results for '{text}' in {len(self.df)} items in {time() - t:.2f} seconds") return [] diff --git a/activity_browser/layouts/panes/database_products.py b/activity_browser/layouts/panes/database_products.py index 4a49851c5..4439e140a 100644 --- a/activity_browser/layouts/panes/database_products.py +++ b/activity_browser/layouts/panes/database_products.py @@ -1,5 +1,6 @@ from logging import getLogger from time import time +from collections import Counter import pandas as pd from qtpy import QtWidgets, QtCore, QtGui @@ -56,6 +57,8 @@ def __init__(self, parent, db_name: str): self.table_view = ProductView(self) self.table_view.setModel(self.model) self.model.setDataFrame(self.build_df()) + self.model.has_external_search = True + self.model.external_col_name = db_name self.search = widgets.ABLineEdit(self) self.search.setMaximumHeight(30) @@ -81,7 +84,11 @@ def connect_signals(self): signals.database.deleted.connect(self.on_database_deleted) self.table_view.filtered.connect(self.search_error) - self.search.textChangedDebounce.connect(self.table_view.setAllFilter) + self.search.textChangedDebounce.connect(self.set_queries) + + def set_queries(self, query: str) -> None: + self.model.set_external_query(query) + self.table_view.setAllFilter(query) def saveState(self): """ @@ -360,6 +367,27 @@ def selected_activities(self) -> [tuple]: items = [i.internalPointer() for i in self.selectedIndexes() if isinstance(i.internalPointer(), ProductItem)] return list({item["activity_key"] for item in items if item["activity_key"] is not None}) + def buildQuery(self) -> str: + queries = ["(index == index)"] + + # query for the column filters + for col in list(self.columnFilters): + if col not in self.model().columns(): + del self.columnFilters[col] + + for col, query in self.columnFilters.items(): + q = f"({col}.astype('str').str.contains('{self.format_query(query)}'))" + queries.append(q) + + # query for the all filter + if self.allFilter.startswith('='): + queries.append(f"({self.allFilter[1:]})") + + query = " & ".join(queries) + log.debug(f"{self.__class__.__name__} built query: {query}") + + return query + class ProductItem(ui.widgets.ABDataItem): """ @@ -454,3 +482,29 @@ def values_from_indices(key: str, indices: list[QtCore.QModelIndex]): continue values.append(item[key]) return values + + def external_search(self, query): + 
results = AB_metadata.db_search(query, database=self.external_col_name, return_counter=True) + + # extract a dict with 'key' as key and 'id' as values from the metadata + result_ids = set(results.keys()) + # extract df with only result IDs and columns 'id' and 'key' + df = AB_metadata.dataframe[AB_metadata.dataframe["id"].isin(result_ids)].loc[:, ["id", "key"]] + df = df.set_index("key", drop=True) + translate_dict = df.to_dict()["id"] + + # convert the metadata id scores to row id scores + row_scores = Counter() + df = self.dataframe.copy() + act_idx = set(df[df["activity_key"].isin(translate_dict.keys())].index.to_list()) + prd_idx = set(df[df["product_key"].isin(translate_dict.keys())].index.to_list()) + indices = act_idx | prd_idx # combine the two sets ('|' is a set union) + # iterate over the indices + for index in indices: + act_score = results.get(translate_dict.get(df.loc[index, "activity_key"]), 0) + prd_score = results.get(translate_dict.get(df.loc[index, "product_key"]), 0) + row_scores[index] = act_score + prd_score + + # finally only return the indices + sorted_indices = [identifier[0] for identifier in row_scores.most_common()] + return sorted_indices diff --git a/activity_browser/ui/widgets/item_model.py b/activity_browser/ui/widgets/item_model.py index 62c7b040a..696a6f9dc 100644 --- a/activity_browser/ui/widgets/item_model.py +++ b/activity_browser/ui/widgets/item_model.py @@ -26,6 +26,9 @@ def __init__(self, parent=None, dataframe=None): self.sort_column: int = 0 # column that is currently sorted self.sort_order: Qt.SortOrder = Qt.SortOrder.AscendingOrder self._query = "" # Pandas query currently applied to the dataframe + self.has_external_search = False + self._external_query = "" + self.external_col_name = "" self.setDataFrame(self.dataframe) @@ -192,7 +195,11 @@ def endResetModel(self): # apply any queries to the dataframe if q := self.query(): - df = self.dataframe.query(q).reset_index(drop=True).copy() + df = self.dataframe.copy() + if self.has_external_search and self._external_query != "": + indices = self.external_search(self._external_query) + df = df.loc[indices] + df = df.query(q).reset_index(drop=True) else: df = self.dataframe.copy() @@ -271,11 +278,15 @@ def setQuery(self, query: str): self._query = query self.endResetModel() + def set_external_query(self, query: str): + if not query.startswith("="): + self._external_query = query + + def external_search(self, query): + NotImplementedError + def hasChildren(self, parent: QtCore.QModelIndex): item = parent.internalPointer() if isinstance(item, ABAbstractItem): return item.has_children() return super().hasChildren(parent) - - - diff --git a/activity_browser/ui/widgets/treeview.py b/activity_browser/ui/widgets/treeview.py index 89cb49aa0..36222b1bb 100644 --- a/activity_browser/ui/widgets/treeview.py +++ b/activity_browser/ui/widgets/treeview.py @@ -6,6 +6,7 @@ from qtpy.QtCore import Qt from .item_model import ABItemModel +from activity_browser.ui import widgets log = getLogger(__name__) @@ -25,11 +26,11 @@ def __init__(self, pos: QtCore.QPoint, view: "ABTreeView"): col_index = view.columnAt(pos.x()) col_name = model.columns()[col_index] - search_box = QtWidgets.QLineEdit(self) + search_box = widgets.ABLineEdit(self) search_box.setText(view.columnFilters.get(col_name, "")) search_box.setPlaceholderText("Search") search_box.selectAll() - search_box.textChanged.connect(lambda query: view.setColumnFilter(col_name, query)) + search_box.textChangedDebounce.connect(lambda query: view.setColumnFilter(col_name, 
query)) widget_action = QtWidgets.QWidgetAction(self) widget_action.setDefaultWidget(search_box) self.addAction(widget_action) From 5b0a965670edc6951efd4a464f02c0aa199073bf Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Thu, 4 Sep 2025 08:21:48 +0200 Subject: [PATCH 19/47] check all newly added items are unique --- activity_browser/bwutils/searchengine/base.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/activity_browser/bwutils/searchengine/base.py b/activity_browser/bwutils/searchengine/base.py index f01b941a7..d975726c2 100644 --- a/activity_browser/bwutils/searchengine/base.py +++ b/activity_browser/bwutils/searchengine/base.py @@ -244,13 +244,18 @@ def add_identifier(self, data: pd.DataFrame) -> None: raise Exception( f"Identifier column '{self.identifier_name}' not in new data, impossible to add data without identifier") - # make sure we the identifier does not yet exist + # make sure we the new identifiers do not yet exist existing_ids = set(self.df.index.to_list()) for identifier in data[self.identifier_name]: if identifier in existing_ids: raise Exception( f"Identifier '{identifier}' is already in use, use a different identifier or use the change_identifier function.") + # make sure all new identifiers given are unique + if data[self.identifier_name].nunique() != data.shape[0]: + raise KeyError( + f"Identifier column {self.identifier_name} must only contain unique values. Found {data[self.identifier_name].nunique()} unique values for length {data.shape[0]}") + df_cols = self.columns # add cols to new data that are missing for col in df_cols: From 9ee34503703a83d2d50bd6e5024b74009a1c7c09 Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Thu, 4 Sep 2025 16:17:49 +0200 Subject: [PATCH 20/47] dont allow sorting of table when search engine in use --- activity_browser/ui/widgets/item_model.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/activity_browser/ui/widgets/item_model.py b/activity_browser/ui/widgets/item_model.py index 09c9d7fc3..3a2fd6d74 100644 --- a/activity_browser/ui/widgets/item_model.py +++ b/activity_browser/ui/widgets/item_model.py @@ -203,13 +203,14 @@ def endResetModel(self): else: df = self.dataframe.copy() - if not self.sort_column > len(self.columns()) - 1: - # apply the sorting - df.sort_values( - by=self.columns()[self.sort_column], - ascending=(self.sort_order == Qt.SortOrder.AscendingOrder), - inplace=True, ignore_index=True - ) + if not (self.has_external_search and self._external_query != ""): + if not self.sort_column > len(self.columns()) - 1: + # apply the sorting + df.sort_values( + by=self.columns()[self.sort_column], + ascending=(self.sort_order == Qt.SortOrder.AscendingOrder), + inplace=True, ignore_index=True + ) # rebuild the ABItem tree self.root = self.branchItemClass("root") @@ -281,6 +282,8 @@ def setQuery(self, query: str): def set_external_query(self, query: str): if not query.startswith("="): self._external_query = query + else: + self._external_query = "" def external_search(self, query): NotImplementedError From e92d298dce86a11bad237cae3821a99bdf3beac7 Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Thu, 4 Sep 2025 16:19:32 +0200 Subject: [PATCH 21/47] resolve search bug with multiple typos not working --- activity_browser/bwutils/searchengine/base.py | 25 ++++++++++++------- .../bwutils/searchengine/metadata_search.py | 24 ++++++++++++------ tests/test_search.py | 6 +++++ 3 files changed, 38 insertions(+), 17 deletions(-) diff --git a/activity_browser/bwutils/searchengine/base.py 
b/activity_browser/bwutils/searchengine/base.py index d975726c2..75b81424f 100644 --- a/activity_browser/bwutils/searchengine/base.py +++ b/activity_browser/bwutils/searchengine/base.py @@ -43,8 +43,8 @@ def __init__(self, df: pd.DataFrame, identifier_name: str, searchable_columns: l log.debug(f"SearchEngine initializing for {len(df)} items") # compile regex patterns for cleaning - self.SUB_PATTERN = re.compile(r"[,\(\)\[\]'\"]") # for replacing with empty string - self.SPACE_PATTERN = re.compile(r"[-−:;]") # for replacing with space + self.SUB_PATTERN = re.compile(r"[,\(\)\[\]'\"…]") # for replacing with empty string + self.SPACE_PATTERN = re.compile(r"[-−:;/+]") # for replacing with space self.ONE_SPACE_PATTERN = re.compile(r"\s+") # for replacing multiple white space with 1 space self.q = 2 # character length of q grams @@ -471,9 +471,10 @@ def find_q_gram_matches(self, q_grams: set) -> pd.DataFrame: max_q = max(matches["matches"]) # this has the most matching q-grams # determine how many results we want to keep based on how good our results are - min_q = max(max_q * 0.32, # have at least a third of q-grams of best match or... - max(n_q_grams * 0.5, # if more, at least half the q-grams in the query word? - 1)) # okay just do 1 q-gram if there are no more in the word + min_q = min(max(max_q * 0.32, # have at least a third of q-grams of best match or... + max(n_q_grams * 0.5, # if more, at least half the q-grams in the query word? + 1)), # okay just do 1 q-gram if there are no more in the word + max_q) # never have min_q be over max_q matches = matches[matches["matches"] >= min_q] matches = matches.sort_values(by="matches", ascending=False) @@ -650,6 +651,7 @@ def fuzzy_search(self, text: str, return_counter: bool = False) -> list: Finally, all found identifiers are sorted on their weight and returned. 
""" + text = text.strip() queries = self.build_queries(text) @@ -684,13 +686,16 @@ def fuzzy_search(self, text: str, return_counter: bool = False) -> list: # we already did these above continue for query in query_set: - # get the intersection of all identifiers # meaning, a set of identifiers that occur in ALL sets of len(1) for the individual words in the query # this ensures we only ever search data where ALL items occur to substantially reduce search-space # finally, make this a Counter (with each item=1) so we can properly weigh things later - query_identifier_set = set.intersection(*[set(query_to_identifier.get(q_word)) for q_word in query if - query_to_identifier.get(q_word, False)]) + query_id_sets = [set(query_to_identifier.get(q_word)) for q_word in query if + query_to_identifier.get(q_word, False)] + if len(query_id_sets) > 0: + query_identifier_set = set.intersection(*query_id_sets) + else: + query_identifier_set = set() if len(query_identifier_set) == 0: # there is no match for this combination of query words, skip break @@ -701,7 +706,8 @@ def fuzzy_search(self, text: str, return_counter: bool = False) -> list: for identifier in query_identifier_set: weight = 0 for query_word in query: - weight += query_to_identifier[query_word][identifier] + # if the query_word and identifier combination exist get score, otherwise 0 + weight += query_to_identifier.get(query_word, {}).get(identifier, 0) query_identifiers[identifier] = weight @@ -742,6 +748,7 @@ def fuzzy_search(self, text: str, return_counter: bool = False) -> list: def search(self, text) -> list: """Search the dataframe on this text, return a sorted list of identifiers.""" t = time() + text = text.strip() if len(text) == 0: log.debug(f"Empty search, returned all items") diff --git a/activity_browser/bwutils/searchengine/metadata_search.py b/activity_browser/bwutils/searchengine/metadata_search.py index 624e2365b..c09c28aa8 100644 --- a/activity_browser/bwutils/searchengine/metadata_search.py +++ b/activity_browser/bwutils/searchengine/metadata_search.py @@ -47,9 +47,10 @@ def find_q_gram_matches(self, q_grams: set) -> pd.DataFrame: max_q = max(matches["matches"]) # this has the most matching q-grams # determine how many results we want to keep based on how good our results are - min_q = max(max_q * 0.32, # have at least a third of q-grams of best match or... - max(n_q_grams * 0.5, # if more, at least half the q-grams in the query word? - 1)) # okay just do 1 q-gram if there are no more in the word + min_q = min(max(max_q * 0.32, # have at least a third of q-grams of best match or... + max(n_q_grams * 0.5, # if more, at least half the q-grams in the query word? + 1)), # okay just do 1 q-gram if there are no more in the word + max_q) # never have min_q be over max_q matches = matches[matches["matches"] >= min_q] matches = matches.sort_values(by="matches", ascending=False) @@ -60,10 +61,12 @@ def find_q_gram_matches(self, q_grams: set) -> pd.DataFrame: def fuzzy_search(self, text: str, database: Optional[str] = None, return_counter: bool = False, logging: bool = True) -> list: """Overwritten for extra database specific reduction of results. 
""" + t = time() + text = text.strip() + if len(text) == 0: log.debug(f"Empty search, returned all items") return self.df.index.to_list() - t = time() # DATABASE SPECIFIC get the set of ids that is in this database if database is not None: @@ -116,13 +119,16 @@ def fuzzy_search(self, text: str, database: Optional[str] = None, return_counter # we already did these above continue for query in query_set: - # get the intersection of all identifiers # meaning, a set of identifiers that occur in ALL sets of len(1) for the individual words in the query # this ensures we only ever search data where ALL items occur to substantially reduce search-space # finally, make this a Counter (with each item=1) so we can properly weigh things later - query_identifier_set = set.intersection(*[set(query_to_identifier.get(q_word)) for q_word in query if - query_to_identifier.get(q_word, False)]) + query_id_sets = [set(query_to_identifier.get(q_word)) for q_word in query if + query_to_identifier.get(q_word, False)] + if len(query_id_sets) > 0: + query_identifier_set = set.intersection(*query_id_sets) + else: + query_identifier_set = set() if len(query_identifier_set) == 0: # there is no match for this combination of query words, skip break @@ -133,7 +139,8 @@ def fuzzy_search(self, text: str, database: Optional[str] = None, return_counter for identifier in query_identifier_set: weight = 0 for query_word in query: - weight += query_to_identifier[query_word][identifier] + # if the query_word and identifier combination exist get score, otherwise 0 + weight += query_to_identifier.get(query_word, {}).get(identifier, 0) query_identifiers[identifier] = weight @@ -177,6 +184,7 @@ def fuzzy_search(self, text: str, database: Optional[str] = None, return_counter def search(self, text, database: Optional[str] = None) -> list: """Search the dataframe on this text, return a sorted list of identifiers.""" t = time() + text = text.strip() if len(text) == 0: log.debug(f"Empty search, returned all items") diff --git a/tests/test_search.py b/tests/test_search.py index 2bb038124..6d63c14ee 100644 --- a/tests/test_search.py +++ b/tests/test_search.py @@ -127,6 +127,12 @@ def test_search_base(): assert se.search("coal") == ["a", "h", "c", "b", "d", "g", "f"] # do search on other term assert se.search("coal production") == ["a", "c", "b", "d", "h", "f", "g"] + # do search on typo + assert se.search("cola") == ["a", "c", "h", "b", "d", "f", "g"] + # do search on longer typo + assert se.search("cola production") == ["c", "a", "b", "d", "h", "f", "g"] + # do search on something we will definitely not find + assert se.search("dontFindThis") == [] # init search class with 1 col searchable se = SearchEngine(df, identifier_name="id", searchable_columns=["col2"]) From 83ae1621f6329679c190ff7b97beab0eb1600008 Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Thu, 4 Sep 2025 18:01:59 +0200 Subject: [PATCH 22/47] First version of autocomplete --- activity_browser/bwutils/metadata.py | 5 ++ .../bwutils/searchengine/metadata_search.py | 72 +++++++++++++++++++ .../layouts/panes/database_products.py | 3 +- activity_browser/ui/widgets/__init__.py | 2 +- activity_browser/ui/widgets/line_edit.py | 36 +++++++++- 5 files changed, 115 insertions(+), 3 deletions(-) diff --git a/activity_browser/bwutils/metadata.py b/activity_browser/bwutils/metadata.py index 7e1ff5e1f..70ab0606f 100644 --- a/activity_browser/bwutils/metadata.py +++ b/activity_browser/bwutils/metadata.py @@ -368,4 +368,9 @@ def db_search(self, query:str, database: Optional[str] = None, 
return_counter: b def search(self, query:str): return self.search_engine.search(query) + def auto_complete(self, word:str, database: Optional[str] = None): + word = self.search_engine.clean_text(word) + completions = self.search_engine.auto_complete(word, database) + return completions + AB_metadata = MetaDataStore() diff --git a/activity_browser/bwutils/searchengine/metadata_search.py b/activity_browser/bwutils/searchengine/metadata_search.py index c09c28aa8..ef6162723 100644 --- a/activity_browser/bwutils/searchengine/metadata_search.py +++ b/activity_browser/bwutils/searchengine/metadata_search.py @@ -12,6 +12,78 @@ class MetaDataSearchEngine(SearchEngine): + + def auto_complete(self, text: str, database) -> OrderedDict: + """Based on spellchecker, make more useful for autocompletions + """ + if database is not None: + self.database_ids = set(self.df[self.df["database"] == database].index.to_list()) + else: + self.database_ids = None + + count_occurence = lambda x: sum(self.word_to_identifier[x].values()) # count occurences of a word + + word_results = OrderedDict() + + matches_min = 3 # ideally we have at least this many alternatives + matches_max = 10 # ideally don't much more than this many matches + always_accept_this = 1 # values of this edit distance or lower always accepted + never_accept_this = 4 # values this edit distance or over always rejected + + # make list of unique words + words = OrderedDict() + for word in text.split(" "): + words[word] = False + words = words.keys() + + words = [self.clean_text(word) for word in words] + + for word in words: + # first, find possible matches quickly + q_grams = self.text_to_positional_q_gram(word) + possible_matches = self.find_q_gram_matches(set(q_grams)) + + matches = [] + first_matches = Counter() + other_matches = {} + + # now, refine with edit distance + for row in possible_matches.itertuples(): + + edit_distance = self.osa_distance(word, row[1], cutoff=never_accept_this) + + if edit_distance == 0: + continue # we are looking for alternatives only, not the exact word + elif edit_distance <= always_accept_this: + first_matches[row[1]] = count_occurence(row[1]) + elif edit_distance < never_accept_this: + if not other_matches.get(edit_distance): + other_matches[edit_distance] = Counter() + other_matches[edit_distance][row[1]] = count_occurence(row[1]) + else: + continue + + # add matches in correct order: + for match, _ in first_matches.most_common(): + matches.append(match) + # if we have fewer matches than goal, add more 'less good' matches + if len(matches) < matches_min: + for i in range(always_accept_this + 1, never_accept_this): + # iteratively increase matches with 'worse' results so we hit goal of minimum alternatives + if new := other_matches.get(i): + prev_num = 10e100 + for match, num in new.most_common(): + if num == prev_num: + matches.append(match) + elif num != prev_num and len(matches) <= matches_max: + matches.append(match) + else: + break + prev_num = num + + word_results[word] = matches + return word_results + def find_q_gram_matches(self, q_grams: set) -> pd.DataFrame: """Overwritten for extra database specific reduction of results. 
""" diff --git a/activity_browser/layouts/panes/database_products.py b/activity_browser/layouts/panes/database_products.py index 4439e140a..48de92aa1 100644 --- a/activity_browser/layouts/panes/database_products.py +++ b/activity_browser/layouts/panes/database_products.py @@ -60,7 +60,8 @@ def __init__(self, parent, db_name: str): self.model.has_external_search = True self.model.external_col_name = db_name - self.search = widgets.ABLineEdit(self) + self.search = widgets.MetaDataAutoCompleteLineEdit(self) + self.search.database_name = db_name self.search.setMaximumHeight(30) self.search.setPlaceholderText("Quick Search") diff --git a/activity_browser/ui/widgets/__init__.py b/activity_browser/ui/widgets/__init__.py index f8c0c439b..333811439 100644 --- a/activity_browser/ui/widgets/__init__.py +++ b/activity_browser/ui/widgets/__init__.py @@ -2,7 +2,7 @@ from .comparison_switch import SwitchComboBox from .cutoff_menu import CutoffMenu from .line_edit import (ABLineEdit, SignalledComboEdit, SignalledLineEdit, - SignalledPlainTextEdit) + SignalledPlainTextEdit, MetaDataAutoCompleteLineEdit) from .treeview import ABTreeView from .item_model import ABItemModel from .item import ABAbstractItem, ABBranchItem, ABDataItem diff --git a/activity_browser/ui/widgets/line_edit.py b/activity_browser/ui/widgets/line_edit.py index 655d269d5..d78c2557b 100644 --- a/activity_browser/ui/widgets/line_edit.py +++ b/activity_browser/ui/widgets/line_edit.py @@ -1,8 +1,10 @@ from qtpy import QtWidgets -from qtpy.QtCore import QTimer, Slot, Signal, SignalInstance +from qtpy.QtCore import QTimer, Slot, Signal, SignalInstance, QStringListModel, Qt from qtpy.QtGui import QTextFormat from qtpy.QtWidgets import QCompleter +from activity_browser.bwutils import AB_metadata + class ABLineEdit(QtWidgets.QLineEdit): textChangedDebounce: SignalInstance = Signal(str) @@ -120,3 +122,35 @@ def __init__(self, items: list[str], parent=None): super().__init__(parent=parent) completer = QCompleter(items, self) self.setCompleter(completer) + +class MetaDataAutoCompleteLineEdit(ABLineEdit): + """Line Edit with MetaDataStore completer attached""" + def __init__(self, parent=None): + super().__init__(parent=parent) + self.database_name = "" + + self.textChanged.connect(self._set_items) + + self.model = QStringListModel() + self.completer = QCompleter(self.model) + self.completer.setPopup(self.completer.popup()) + self.completer.setCompletionMode(QCompleter.UnfilteredPopupCompletion) + self.setCompleter(self.completer) + + def _set_items(self): + text = self.text() + + words = text.split(" ") + if len(words) == 0: + self.model.setStringList([]) + return + + alternatives = AB_metadata.auto_complete(words[-1], database=self.database_name) + alternatives = alternatives[words[-1]][:5] # allow for max n autocompletes + print(alternatives) + + items = [] + for alternative in alternatives: + line = " ".join(words[:-1] + [alternative]) + items.append(line) + self.model.setStringList(items) From 2b61e161329363e4d1ccf0b44d64c0eadc506a01 Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Thu, 4 Sep 2025 21:01:33 +0200 Subject: [PATCH 23/47] cache database identifiers for faster results + much faster autocomplete --- .../bwutils/searchengine/metadata_search.py | 132 ++++++++++-------- 1 file changed, 71 insertions(+), 61 deletions(-) diff --git a/activity_browser/bwutils/searchengine/metadata_search.py b/activity_browser/bwutils/searchengine/metadata_search.py index ef6162723..ce8483373 100644 --- a/activity_browser/bwutils/searchengine/metadata_search.py 
+++ b/activity_browser/bwutils/searchengine/metadata_search.py @@ -12,76 +12,92 @@ class MetaDataSearchEngine(SearchEngine): + def database_id_manager(self, database): + if not hasattr(self, "all_database_ids"): + self.all_database_ids = {} - def auto_complete(self, text: str, database) -> OrderedDict: - """Based on spellchecker, make more useful for autocompletions - """ - if database is not None: + if database_ids := self.all_database_ids.get(database): + self.database_ids = database_ids + elif database is not None: self.database_ids = set(self.df[self.df["database"] == database].index.to_list()) + self.all_database_ids[database] = self.database_ids else: self.database_ids = None + def reset_database_id_manager(self): + del self.all_database_ids + del self.database_ids + + def add_identifier(self, data: pd.DataFrame) -> None: + super().add_identifier(data) + self.reset_database_id_manager() + + def remove_identifier(self, identifier, logging=True) -> None: + super().remove_identifier(identifier, logging=logging) + self.reset_database_id_manager() + + def change_identifier(self, identifier, data: pd.DataFrame) -> None: + super().change_identifier(identifier, data) + self.reset_database_id_manager() + + def auto_complete(self, word: str, database) -> OrderedDict: + """Based on spellchecker, make more useful for autocompletions + """ + self.database_id_manager(database) + count_occurence = lambda x: sum(self.word_to_identifier[x].values()) # count occurences of a word word_results = OrderedDict() - matches_min = 3 # ideally we have at least this many alternatives - matches_max = 10 # ideally don't much more than this many matches - always_accept_this = 1 # values of this edit distance or lower always accepted + matches_min = 2 # ideally we have at least this many alternatives + matches_max = 4 # ideally don't much more than this many matches never_accept_this = 4 # values this edit distance or over always rejected - # make list of unique words - words = OrderedDict() - for word in text.split(" "): - words[word] = False - words = words.keys() - - words = [self.clean_text(word) for word in words] + # first, find possible matches quickly + q_grams = self.text_to_positional_q_gram(word) + possible_matches = self.find_q_gram_matches(set(q_grams)) - for word in words: - # first, find possible matches quickly - q_grams = self.text_to_positional_q_gram(word) - possible_matches = self.find_q_gram_matches(set(q_grams)) + matches = [] + first_matches = Counter() + other_matches = {} - matches = [] - first_matches = Counter() - other_matches = {} + # now, refine with edit distance + for row in possible_matches.itertuples(): - # now, refine with edit distance - for row in possible_matches.itertuples(): + if len(word) > len(row[1]) or word == row[1]: + continue + test_word = row[1][:len(word)] - edit_distance = self.osa_distance(word, row[1], cutoff=never_accept_this) + edit_distance = self.osa_distance(word, test_word, cutoff=min(never_accept_this, len(word))) - if edit_distance == 0: - continue # we are looking for alternatives only, not the exact word - elif edit_distance <= always_accept_this: - first_matches[row[1]] = count_occurence(row[1]) - elif edit_distance < never_accept_this: - if not other_matches.get(edit_distance): - other_matches[edit_distance] = Counter() - other_matches[edit_distance][row[1]] = count_occurence(row[1]) - else: - continue - - # add matches in correct order: - for match, _ in first_matches.most_common(): - matches.append(match) - # if we have fewer matches than goal, add 
more 'less good' matches - if len(matches) < matches_min: - for i in range(always_accept_this + 1, never_accept_this): - # iteratively increase matches with 'worse' results so we hit goal of minimum alternatives - if new := other_matches.get(i): - prev_num = 10e100 - for match, num in new.most_common(): - if num == prev_num: - matches.append(match) - elif num != prev_num and len(matches) <= matches_max: - matches.append(match) - else: - break - prev_num = num + if edit_distance == 0: + first_matches[row[1]] = count_occurence(row[1]) + elif edit_distance < never_accept_this: + if not other_matches.get(edit_distance): + other_matches[edit_distance] = Counter() + other_matches[edit_distance][row[1]] = count_occurence(row[1]) + else: + continue - word_results[word] = matches + # add matches in correct order: + for match, _ in first_matches.most_common(): + matches.append(match) + # if we have fewer matches than goal, add more 'less good' matches + if len(matches) < matches_min: + for i in range(1, never_accept_this): + # iteratively increase matches with 'worse' results so we hit goal of minimum alternatives + if new := other_matches.get(i): + prev_num = 10e100 + for match, num in new.most_common(): + if num == prev_num: + matches.append(match) + elif num != prev_num and len(matches) <= matches_max: + matches.append(match) + else: + break + prev_num = num + + word_results[word] = matches return word_results def find_q_gram_matches(self, q_grams: set) -> pd.DataFrame: @@ -141,10 +157,7 @@ def fuzzy_search(self, text: str, database: Optional[str] = None, return_counter return self.df.index.to_list() # DATABASE SPECIFIC get the set of ids that is in this database - if database is not None: - self.database_ids = set(self.df[self.df["database"] == database].index.to_list()) - else: - self.database_ids = None + self.database_id_manager(database) queries = self.build_queries(text) @@ -263,10 +276,7 @@ def search(self, text, database: Optional[str] = None) -> list: return self.df.index.to_list() # get the set of ids that is in this database - if database is not None: - self.database_ids = set(self.df[self.df["database"] == database].index.to_list()) - else: - self.database_ids = None + self.database_id_manager(database) fuzzy_identifiers = self.fuzzy_search(text, database=database, logging=False) if len(fuzzy_identifiers) == 0: From 64bbcd1082d28c0c8b0a4616043a13c55dfa58f9 Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Thu, 4 Sep 2025 22:07:54 +0200 Subject: [PATCH 24/47] Implement proper autocomplete popup --- .../bwutils/searchengine/metadata_search.py | 14 +++--- activity_browser/ui/widgets/line_edit.py | 47 ++++++++++++++++--- 2 files changed, 47 insertions(+), 14 deletions(-) diff --git a/activity_browser/bwutils/searchengine/metadata_search.py b/activity_browser/bwutils/searchengine/metadata_search.py index ce8483373..5bb01b2a4 100644 --- a/activity_browser/bwutils/searchengine/metadata_search.py +++ b/activity_browser/bwutils/searchengine/metadata_search.py @@ -40,15 +40,16 @@ def change_identifier(self, identifier, data: pd.DataFrame) -> None: super().change_identifier(identifier, data) self.reset_database_id_manager() - def auto_complete(self, word: str, database) -> OrderedDict: + def auto_complete(self, word: str, database) -> list: """Based on spellchecker, make more useful for autocompletions """ + if len(word) <= 2: + return [] + self.database_id_manager(database) count_occurence = lambda x: sum(self.word_to_identifier[x].values()) # count occurences of a word - word_results = 
OrderedDict() - matches_min = 2 # ideally we have at least this many alternatives matches_max = 4 # ideally don't much more than this many matches never_accept_this = 4 # values this edit distance or over always rejected @@ -63,13 +64,11 @@ def auto_complete(self, word: str, database) -> OrderedDict: # now, refine with edit distance for row in possible_matches.itertuples(): - if len(word) > len(row[1]) or word == row[1]: continue - test_word = row[1][:len(word)] + test_word = row[1][:len(word)] # only find edit distance of first part of word edit_distance = self.osa_distance(word, test_word, cutoff=min(never_accept_this, len(word))) - if edit_distance == 0: first_matches[row[1]] = count_occurence(row[1]) elif edit_distance < never_accept_this: @@ -97,8 +96,7 @@ def auto_complete(self, word: str, database) -> OrderedDict: break prev_num = num - word_results[word] = matches - return word_results + return matches def find_q_gram_matches(self, q_grams: set) -> pd.DataFrame: """Overwritten for extra database specific reduction of results. diff --git a/activity_browser/ui/widgets/line_edit.py b/activity_browser/ui/widgets/line_edit.py index d78c2557b..6e85ab11b 100644 --- a/activity_browser/ui/widgets/line_edit.py +++ b/activity_browser/ui/widgets/line_edit.py @@ -123,34 +123,69 @@ def __init__(self, items: list[str], parent=None): completer = QCompleter(items, self) self.setCompleter(completer) + class MetaDataAutoCompleteLineEdit(ABLineEdit): """Line Edit with MetaDataStore completer attached""" + + textChangedAutoCompleteDebounce: SignalInstance = Signal() + _debounce_autocomplete_ms = 75 + def __init__(self, parent=None): super().__init__(parent=parent) + + # debounce timer settings + self._debounce_autocomplete_timer = QTimer(self, singleShot=True) + # self.textChanged.connect(self._set_autocomplete_debounce) + self._debounce_autocomplete_timer.timeout.connect(self._emit_autocomplete_debounce) + self.database_name = "" - self.textChanged.connect(self._set_items) + # trigger autocomplete list update + self.textChangedAutoCompleteDebounce.connect(self._set_items) + # autocompleter settings self.model = QStringListModel() self.completer = QCompleter(self.model) - self.completer.setPopup(self.completer.popup()) - self.completer.setCompletionMode(QCompleter.UnfilteredPopupCompletion) + self.popup = self.completer.popup() + self.popup.setHorizontalScrollBarPolicy(Qt.ScrollBarAsNeeded) + self.popup.setMaximumHeight(20) + self.completer.setPopup(self.popup) + self.completer.setCompletionMode(QCompleter.UnfilteredPopupCompletion) # allow all items in popup list + self.setCompleter(self.completer) def _set_items(self): text = self.text() - + self.popup.setMaximumHeight(self.popup.sizeHintForRow(0) * 3 + 2 * self.popup.frameWidth()) words = text.split(" ") if len(words) == 0: self.model.setStringList([]) return alternatives = AB_metadata.auto_complete(words[-1], database=self.database_name) - alternatives = alternatives[words[-1]][:5] # allow for max n autocompletes - print(alternatives) + alternatives = alternatives[:5] # allow for max n autocompletes + print(text, alternatives) items = [] for alternative in alternatives: line = " ".join(words[:-1] + [alternative]) items.append(line) self.model.setStringList(items) + self.completer.complete() + + def _set_autocomplete_debounce(self): + self._debounce_autocomplete_timer.setInterval(self._debounce_autocomplete_ms) + self._debounce_autocomplete_timer.start() + + def _emit_autocomplete_debounce(self): + self.textChangedAutoCompleteDebounce.emit() + + def 
keyPressEvent(self, event): + if event.key() == Qt.Key_Escape: + if self.completer.popup().isVisible(): + self.completer.popup().hide() + event.accept() + return + super().keyPressEvent(event) + if event.text().strip(): + QTimer.singleShot(0, self._set_autocomplete_debounce) From e76f57c2b417dc9659a8d047a3f6fc65fd40645f Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Fri, 5 Sep 2025 13:39:28 +0200 Subject: [PATCH 25/47] suggestions for currently edited word instead of last word + better autocomplete menu behaviour --- activity_browser/ui/widgets/line_edit.py | 76 ++++++++++-------------- 1 file changed, 31 insertions(+), 45 deletions(-) diff --git a/activity_browser/ui/widgets/line_edit.py b/activity_browser/ui/widgets/line_edit.py index 6e85ab11b..244fa59c5 100644 --- a/activity_browser/ui/widgets/line_edit.py +++ b/activity_browser/ui/widgets/line_edit.py @@ -127,65 +127,51 @@ def __init__(self, items: list[str], parent=None): class MetaDataAutoCompleteLineEdit(ABLineEdit): """Line Edit with MetaDataStore completer attached""" - textChangedAutoCompleteDebounce: SignalInstance = Signal() - _debounce_autocomplete_ms = 75 - def __init__(self, parent=None): super().__init__(parent=parent) - - # debounce timer settings - self._debounce_autocomplete_timer = QTimer(self, singleShot=True) - # self.textChanged.connect(self._set_autocomplete_debounce) - self._debounce_autocomplete_timer.timeout.connect(self._emit_autocomplete_debounce) - self.database_name = "" - # trigger autocomplete list update - self.textChangedAutoCompleteDebounce.connect(self._set_items) - # autocompleter settings self.model = QStringListModel() self.completer = QCompleter(self.model) self.popup = self.completer.popup() self.popup.setHorizontalScrollBarPolicy(Qt.ScrollBarAsNeeded) - self.popup.setMaximumHeight(20) self.completer.setPopup(self.popup) - self.completer.setCompletionMode(QCompleter.UnfilteredPopupCompletion) # allow all items in popup list - + # allow all items in popup list + self.completer.setCompletionMode(QCompleter.UnfilteredPopupCompletion) self.setCompleter(self.completer) - def _set_items(self): - text = self.text() - self.popup.setMaximumHeight(self.popup.sizeHintForRow(0) * 3 + 2 * self.popup.frameWidth()) - words = text.split(" ") - if len(words) == 0: + # connect textEdited, this only triggers on user input, not Completer input + self.textEdited.connect(self._set_items) + + def _set_items(self, text=None): + if text is None: + text = self.text() + + # find the start and end of the word under the cursor + cursor_pos = self.cursorPosition() + start = cursor_pos + while start > 0 and text[start - 1] != " ": + start -= 1 + end = cursor_pos + while end < len(text) and text[end] != " ": + end += 1 + current_word = text[start:end] + if not current_word: self.model.setStringList([]) return - alternatives = AB_metadata.auto_complete(words[-1], database=self.database_name) - alternatives = alternatives[:5] # allow for max n autocompletes - print(text, alternatives) - + # get suggestions for the current word + alternatives = AB_metadata.auto_complete(current_word, database=self.database_name) + alternatives = alternatives[:6] # at most 6, though we should get ~3 usually + # replace the current word with each alternative items = [] - for alternative in alternatives: - line = " ".join(words[:-1] + [alternative]) - items.append(line) + for alt in alternatives: + new_text = text[:start] + alt + text[end:] + items.append(new_text) + print(text, items) + self.model.setStringList(items) - self.completer.complete() - - 
def _set_autocomplete_debounce(self): - self._debounce_autocomplete_timer.setInterval(self._debounce_autocomplete_ms) - self._debounce_autocomplete_timer.start() - - def _emit_autocomplete_debounce(self): - self.textChangedAutoCompleteDebounce.emit() - - def keyPressEvent(self, event): - if event.key() == Qt.Key_Escape: - if self.completer.popup().isVisible(): - self.completer.popup().hide() - event.accept() - return - super().keyPressEvent(event) - if event.text().strip(): - QTimer.singleShot(0, self._set_autocomplete_debounce) + # set correct height now that we have data + self.popup.setMaximumHeight(self.popup.sizeHintForRow(0) * 3 + 2 * self.popup.frameWidth()) + From eeed99277a23df1fb2e386f4f579c82edc779caf Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Fri, 5 Sep 2025 17:24:17 +0200 Subject: [PATCH 26/47] Improve text cleaning regex + autocomplete deals better with key hashes + manage popup height better --- activity_browser/bwutils/searchengine/base.py | 12 +++++++----- .../bwutils/searchengine/metadata_search.py | 13 +++++++++---- activity_browser/ui/widgets/line_edit.py | 6 +++++- 3 files changed, 21 insertions(+), 10 deletions(-) diff --git a/activity_browser/bwutils/searchengine/base.py b/activity_browser/bwutils/searchengine/base.py index 75b81424f..1cc8235ee 100644 --- a/activity_browser/bwutils/searchengine/base.py +++ b/activity_browser/bwutils/searchengine/base.py @@ -43,9 +43,9 @@ def __init__(self, df: pd.DataFrame, identifier_name: str, searchable_columns: l log.debug(f"SearchEngine initializing for {len(df)} items") # compile regex patterns for cleaning - self.SUB_PATTERN = re.compile(r"[,\(\)\[\]'\"…]") # for replacing with empty string - self.SPACE_PATTERN = re.compile(r"[-−:;/+]") # for replacing with space - self.ONE_SPACE_PATTERN = re.compile(r"\s+") # for replacing multiple white space with 1 space + self.SUB_END_PATTERN = re.compile(r"[,.\"'`)\[\]}\\/\-−_:;+…]+(?=\s|$)") # remove these from end of word + self.SUB_START_PATTERN = re.compile(r"(?:^|\s)[,.\"'`(\[{\\/\-−_:;+]+") # remove these from start of word + self.ONE_SPACE_PATTERN = re.compile(r"\s+") # remove these multiple whitespaces self.q = 2 # character length of q grams self.base_weight = 10 # base weighting for sorting results @@ -136,8 +136,10 @@ def update_dict(update_me: dict, new: dict) -> dict: def clean_text(self, text: str): """Clean a string so it doesn't contain weird characters or multiple spaces etc.""" - text = self.SUB_PATTERN.sub("", text.lower()) - text = self.SPACE_PATTERN.sub(" ", text) + text = text.lower() + text = self.SUB_END_PATTERN.sub("", text) + text = self.SUB_START_PATTERN.sub(" ", text) + text = self.ONE_SPACE_PATTERN.sub(" ", text).strip() return text diff --git a/activity_browser/bwutils/searchengine/metadata_search.py b/activity_browser/bwutils/searchengine/metadata_search.py index 5bb01b2a4..33bdc34e8 100644 --- a/activity_browser/bwutils/searchengine/metadata_search.py +++ b/activity_browser/bwutils/searchengine/metadata_search.py @@ -43,7 +43,7 @@ def change_identifier(self, identifier, data: pd.DataFrame) -> None: def auto_complete(self, word: str, database) -> list: """Based on spellchecker, make more useful for autocompletions """ - if len(word) <= 2: + if len(word) <= 1: return [] self.database_id_manager(database) @@ -52,7 +52,7 @@ def auto_complete(self, word: str, database) -> list: matches_min = 2 # ideally we have at least this many alternatives matches_max = 4 # ideally don't much more than this many matches - never_accept_this = 4 # values this edit distance 
or over always rejected + never_accept_this = 5 # values this edit distance or over always rejected # first, find possible matches quickly q_grams = self.text_to_positional_q_gram(word) @@ -61,6 +61,7 @@ def auto_complete(self, word: str, database) -> list: matches = [] first_matches = Counter() other_matches = {} + probably_keys = [] # if we suspect it's a key hash, dump it at the end of the list # now, refine with edit distance for row in possible_matches.itertuples(): @@ -68,8 +69,11 @@ def auto_complete(self, word: str, database) -> list: continue test_word = row[1][:len(word)] # only find edit distance of first part of word - edit_distance = self.osa_distance(word, test_word, cutoff=min(never_accept_this, len(word))) - if edit_distance == 0: + edit_distance = self.osa_distance(word, test_word, cutoff=min(never_accept_this, (len(word) * (2 / 3)))) + if len(row[1]) == 32 and edit_distance < never_accept_this: + # dump any items that are likely to be keys at the end of the list + probably_keys.append(row[1]) + elif edit_distance == 0: first_matches[row[1]] = count_occurence(row[1]) elif edit_distance < never_accept_this: if not other_matches.get(edit_distance): @@ -96,6 +100,7 @@ def auto_complete(self, word: str, database) -> list: break prev_num = num + matches = matches + probably_keys return matches def find_q_gram_matches(self, q_grams: set) -> pd.DataFrame: diff --git a/activity_browser/ui/widgets/line_edit.py b/activity_browser/ui/widgets/line_edit.py index 244fa59c5..9545c5943 100644 --- a/activity_browser/ui/widgets/line_edit.py +++ b/activity_browser/ui/widgets/line_edit.py @@ -173,5 +173,9 @@ def _set_items(self, text=None): self.model.setStringList(items) # set correct height now that we have data - self.popup.setMaximumHeight(self.popup.sizeHintForRow(0) * 3 + 2 * self.popup.frameWidth()) + max_height = max( + 20, + self.popup.sizeHintForRow(0) * 3 + 2 * self.popup.frameWidth() + ) + self.popup.setMaximumHeight(max_height) From bba71c7c0075870ff86be321dc1de32a01fa721c Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Fri, 5 Sep 2025 17:59:27 +0200 Subject: [PATCH 27/47] better key hash sorting --- activity_browser/bwutils/searchengine/base.py | 4 +--- .../bwutils/searchengine/metadata_search.py | 19 +++++++++---------- 2 files changed, 10 insertions(+), 13 deletions(-) diff --git a/activity_browser/bwutils/searchengine/base.py b/activity_browser/bwutils/searchengine/base.py index 1cc8235ee..fcee9e4cd 100644 --- a/activity_browser/bwutils/searchengine/base.py +++ b/activity_browser/bwutils/searchengine/base.py @@ -524,7 +524,6 @@ def spell_check(self, text: str) -> OrderedDict: q_grams = self.text_to_positional_q_gram(word) possible_matches = self.find_q_gram_matches(set(q_grams)) - matches = [] first_matches = Counter() other_matches = {} @@ -545,8 +544,7 @@ def spell_check(self, text: str) -> OrderedDict: continue # add matches in correct order: - for match, _ in first_matches.most_common(): - matches.append(match) + matches = [match for match, _ in first_matches.most_common()] # if we have fewer matches than goal, add more 'less good' matches if len(matches) < matches_min: for i in range(always_accept_this + 1, never_accept_this): diff --git a/activity_browser/bwutils/searchengine/metadata_search.py b/activity_browser/bwutils/searchengine/metadata_search.py index 33bdc34e8..43ae1d051 100644 --- a/activity_browser/bwutils/searchengine/metadata_search.py +++ b/activity_browser/bwutils/searchengine/metadata_search.py @@ -40,7 +40,7 @@ def change_identifier(self, identifier, data: 
pd.DataFrame) -> None: super().change_identifier(identifier, data) self.reset_database_id_manager() - def auto_complete(self, word: str, database) -> list: + def auto_complete(self, word: str, database: Optional[str] = None) -> list: """Based on spellchecker, make more useful for autocompletions """ if len(word) <= 1: @@ -53,15 +53,16 @@ def auto_complete(self, word: str, database) -> list: matches_min = 2 # ideally we have at least this many alternatives matches_max = 4 # ideally don't much more than this many matches never_accept_this = 5 # values this edit distance or over always rejected + # or max 2/3 of len(word) if less than never_accept_this + never_accept_this = int(round(min(never_accept_this, max(1, len(word) * (2 / 3))), 0)) # first, find possible matches quickly q_grams = self.text_to_positional_q_gram(word) possible_matches = self.find_q_gram_matches(set(q_grams)) - matches = [] first_matches = Counter() other_matches = {} - probably_keys = [] # if we suspect it's a key hash, dump it at the end of the list + probably_keys = Counter() # if we suspect it's a key hash, dump it at the end of the list # now, refine with edit distance for row in possible_matches.itertuples(): @@ -69,10 +70,9 @@ def auto_complete(self, word: str, database) -> list: continue test_word = row[1][:len(word)] # only find edit distance of first part of word - edit_distance = self.osa_distance(word, test_word, cutoff=min(never_accept_this, (len(word) * (2 / 3)))) - if len(row[1]) == 32 and edit_distance < never_accept_this: - # dump any items that are likely to be keys at the end of the list - probably_keys.append(row[1]) + edit_distance = self.osa_distance(word, test_word, cutoff=never_accept_this) + if len(row[1]) == 32 and edit_distance <= 1: + probably_keys[row[1]] = 100 - edit_distance # keys need to be sorted on edit distance, not on occurence elif edit_distance == 0: first_matches[row[1]] = count_occurence(row[1]) elif edit_distance < never_accept_this: @@ -83,8 +83,7 @@ def auto_complete(self, word: str, database) -> list: continue # add matches in correct order: - for match, _ in first_matches.most_common(): - matches.append(match) + matches = [match for match, _ in first_matches.most_common()] # if we have fewer matches than goal, add more 'less good' matches if len(matches) < matches_min: for i in range(1, never_accept_this): @@ -100,7 +99,7 @@ def auto_complete(self, word: str, database) -> list: break prev_num = num - matches = matches + probably_keys + matches = matches + [match for match, _ in probably_keys.most_common()] return matches def find_q_gram_matches(self, q_grams: set) -> pd.DataFrame: From 8e734369b34ddeecd2731f60dd5c01e9ac107eb9 Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Fri, 5 Sep 2025 19:21:58 +0200 Subject: [PATCH 28/47] better autocomplete performance when many long qgram matches --- .../bwutils/searchengine/metadata_search.py | 23 ++++++++++--------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/activity_browser/bwutils/searchengine/metadata_search.py b/activity_browser/bwutils/searchengine/metadata_search.py index 43ae1d051..4c776235d 100644 --- a/activity_browser/bwutils/searchengine/metadata_search.py +++ b/activity_browser/bwutils/searchengine/metadata_search.py @@ -43,13 +43,12 @@ def change_identifier(self, identifier, data: pd.DataFrame) -> None: def auto_complete(self, word: str, database: Optional[str] = None) -> list: """Based on spellchecker, make more useful for autocompletions """ + count_occurence = lambda x: 
sum(self.word_to_identifier[x].values()) # count occurences of a word if len(word) <= 1: return [] self.database_id_manager(database) - count_occurence = lambda x: sum(self.word_to_identifier[x].values()) # count occurences of a word - matches_min = 2 # ideally we have at least this many alternatives matches_max = 4 # ideally don't much more than this many matches never_accept_this = 5 # values this edit distance or over always rejected @@ -58,7 +57,7 @@ def auto_complete(self, word: str, database: Optional[str] = None) -> list: # first, find possible matches quickly q_grams = self.text_to_positional_q_gram(word) - possible_matches = self.find_q_gram_matches(set(q_grams)) + possible_matches = self.find_q_gram_matches(set(q_grams), return_all=True) first_matches = Counter() other_matches = {} @@ -68,9 +67,8 @@ def auto_complete(self, word: str, database: Optional[str] = None) -> list: for row in possible_matches.itertuples(): if len(word) > len(row[1]) or word == row[1]: continue - test_word = row[1][:len(word)] # only find edit distance of first part of word - - edit_distance = self.osa_distance(word, test_word, cutoff=never_accept_this) + # find edit distance of same size strings + edit_distance = self.osa_distance(word, row[1][:len(word)], cutoff=never_accept_this) if len(row[1]) == 32 and edit_distance <= 1: probably_keys[row[1]] = 100 - edit_distance # keys need to be sorted on edit distance, not on occurence elif edit_distance == 0: @@ -102,7 +100,7 @@ def auto_complete(self, word: str, database: Optional[str] = None) -> list: matches = matches + [match for match, _ in probably_keys.most_common()] return matches - def find_q_gram_matches(self, q_grams: set) -> pd.DataFrame: + def find_q_gram_matches(self, q_grams: set, return_all: bool = False) -> pd.DataFrame: """Overwritten for extra database specific reduction of results. """ n_q_grams = len(q_grams) @@ -137,10 +135,13 @@ def find_q_gram_matches(self, q_grams: set) -> pd.DataFrame: max_q = max(matches["matches"]) # this has the most matching q-grams # determine how many results we want to keep based on how good our results are - min_q = min(max(max_q * 0.32, # have at least a third of q-grams of best match or... - max(n_q_grams * 0.5, # if more, at least half the q-grams in the query word? - 1)), # okay just do 1 q-gram if there are no more in the word - max_q) # never have min_q be over max_q + if not return_all: + min_q = min(max(max_q * 0.32, # have at least a third of q-grams of best match or... + max(n_q_grams * 0.5, # if more, at least half the q-grams in the query word? 
+ 1)), # okay just do 1 q-gram if there are no more in the word + max_q) # never have min_q be over max_q + else: + min_q = 0 matches = matches[matches["matches"] >= min_q] matches = matches.sort_values(by="matches", ascending=False) From 2f078595a16278dd7390b0c3679a4db519b32e43 Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Fri, 5 Sep 2025 21:46:24 +0200 Subject: [PATCH 29/47] resolve bug with removing identifier from searchengine leading to breaking search --- activity_browser/bwutils/searchengine/base.py | 8 ++++++-- tests/test_search.py | 10 ++++++++-- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/activity_browser/bwutils/searchengine/base.py b/activity_browser/bwutils/searchengine/base.py index fcee9e4cd..635d58b20 100644 --- a/activity_browser/bwutils/searchengine/base.py +++ b/activity_browser/bwutils/searchengine/base.py @@ -292,7 +292,6 @@ def remove_identifier(self, identifier, logging=True) -> None: raise Exception( f"Identifier '{identifier}' does not exist in the search data, cannot remove identifier that do not exist.") - # remove from df self.df = self.df.drop(identifier) # find words that may need to be removed @@ -309,10 +308,15 @@ def remove_identifier(self, identifier, logging=True) -> None: # this q_gram is only used in this word, # remove it del self.q_gram_to_word[q_gram] + elif len(self.q_gram_to_word[q_gram]) > 1: + # this q_gram is used in multiple words, only remove the word from the q_gram + del self.q_gram_to_word[q_gram][word] del self.word_to_q_grams[word] else: - # remove the identifier from the dict + # this word is found in multiple identifiers + # word_to_q_gram and q_gram_to_word do not need to be changed, the word still exists + # remove the identifier the word in word_to_identifier del self.word_to_identifier[word][identifier] # finally, remove the identifier del self.identifier_to_word[identifier] diff --git a/tests/test_search.py b/tests/test_search.py index 6d63c14ee..727870359 100644 --- a/tests/test_search.py +++ b/tests/test_search.py @@ -9,8 +9,8 @@ def data_for_test(): ["b", "coal production", "something"], ["c", "coal production", "coat"], ["d", "coal hello production", "something"], - ["e", "dont find me", "hello world"], - ["f", "coat", "another word"], + ["e", "dont zzfind me", "hello world"], + ["f", "coat", "zzanother word"], ["g", "coalispartofthisword", "things"], ["h", "coal", "coal"], ], @@ -199,6 +199,12 @@ def test_search_remove_identifier(): se.remove_identifier(identifier="a") assert se.search("coal production") == ["c", "b", "d", "h", "f", "g"] + # now search on something only in a column we later remove + assert se.search("find") == ["e"] + se.remove_identifier(identifier="e") + assert se.search("find") == [] + + def test_search_change_identifier(): """Do tests for changing identifier.""" From 6e5d1cbb9813bf9b501865d32648979599edcde2 Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Fri, 5 Sep 2025 22:47:18 +0200 Subject: [PATCH 30/47] add functionality for adding, changing and removing identifiers (except full databases) --- activity_browser/bwutils/metadata.py | 49 ++++++++++++++++--- activity_browser/bwutils/searchengine/base.py | 2 +- .../bwutils/searchengine/metadata_search.py | 25 ++++++++-- tests/test_search.py | 1 - 4 files changed, 64 insertions(+), 13 deletions(-) diff --git a/activity_browser/bwutils/metadata.py b/activity_browser/bwutils/metadata.py index 70ab0606f..04498e991 100644 --- a/activity_browser/bwutils/metadata.py +++ b/activity_browser/bwutils/metadata.py @@ -67,6 +67,12 @@ def __init__(self, 
parent=None): self.moveToThread(application.thread()) self.connect_signals() + self.search_engine_whitelist = [ + "id", "name", "synonyms", "unit", "key", "database", # generic + "CAS number", "categories", # biosphere specific + "product", "reference product", "classifications", "location", "properties" # activity specific + ] + def connect_signals(self): signals.project.changed.connect(self.sync) signals.node.changed.connect(self.on_node_changed) @@ -76,11 +82,29 @@ def connect_signals(self): def on_node_deleted(self, ds): try: - self.dataframe.drop(ds.key, inplace=True) + self.dataframe = self.dataframe.drop(ds.key) + self.remove_identifier_from_search_engine(ds) self.synced.emit() except KeyError: pass + def remove_identifier_from_search_engine(self, ds, reset_db_ids=True, logging=True): + data = model_to_dict(ds) + identifier = data["id"] + if identifier in self.search_engine.database_id_manager(data["database"]): + self.search_engine.remove_identifier(identifier, logging=logging) + if reset_db_ids: + self.search_engine.reset_database_id_manager() + + def remove_identifiers_from_search_engine(self, identifiers): + t = time() + for identifier in identifiers: + self.remove_identifier_from_search_engine(identifier, reset_db_ids=False, logging=False) + self.search_engine.reset_database_id_manager() + log.debug(f"Search index updated in {time() - t:.2f} seconds " + f"for {len(identifiers)} removed items " + f"({len(self.search_engine.df)} items ({self.search_engine.size_of_index()}) currently).") + def on_node_changed(self, new, old): data_raw = model_to_dict(new) data = data_raw.pop("data") @@ -98,13 +122,28 @@ def on_node_changed(self, new, old): for col in [col for col in data.columns if col not in self.dataframe.columns]: self.dataframe[col] = pd.NA self.dataframe.loc[new.key] = data.loc[new.key] + self.change_identifier_in_search_engine(identifier=data.loc[new.key, "id"], data=data.loc[[new.key]]) elif self.dataframe.empty: # an activity has been added and the dataframe was empty self.dataframe = data + self.add_identifier_to_search_engine(data) else: # an activity has been added and needs to be concatenated to existing metadata self.dataframe = pd.concat([self.dataframe, data], join="outer") + self.add_identifier_to_search_engine(data) self.thread().eventDispatcher().awake.connect(self._emitSyncLater, Qt.ConnectionType.UniqueConnection) + def add_identifier_to_search_engine(self, data: pd.DataFrame): + search_engine_cols = list(set(data.columns) & set(self.search_engine_whitelist)) # intersection becomes columns + data = data[search_engine_cols] + self.search_engine.add_identifier(data.copy()) + self.search_engine.reset_database_id_manager() + + def change_identifier_in_search_engine(self, identifier, data: pd.DataFrame): + search_engine_cols = list(set(data.columns) & set(self.search_engine_whitelist)) # intersection becomes columns + data = data[search_engine_cols] + self.search_engine.change_identifier(identifier=identifier, data=data.copy()) + self.search_engine.reset_database_id_manager() + @property def databases(self): return set(self.dataframe.get("database", [])) @@ -354,13 +393,9 @@ def _unpacker(self, classifications: list, system: str) -> list: return system_classifications def init_search(self): - allowed_cols = [ - "id", "name", "synonyms", "unit", "key", "database", # generic - "CAS number", "categories", # biosphere specific - "product", "reference product", "classifications", "location", "properties" # activity specific - ] - self.search_engine = 
MetaDataSearchEngine(self.dataframe, identifier_name="id", searchable_columns=allowed_cols) + + self.search_engine = MetaDataSearchEngine(self.dataframe, identifier_name="id", searchable_columns=self.search_engine_whitelist) def db_search(self, query:str, database: Optional[str] = None, return_counter: bool = False): return self.search_engine.fuzzy_search(query, database=database, return_counter=return_counter) diff --git a/activity_browser/bwutils/searchengine/base.py b/activity_browser/bwutils/searchengine/base.py index 635d58b20..f0f34261b 100644 --- a/activity_browser/bwutils/searchengine/base.py +++ b/activity_browser/bwutils/searchengine/base.py @@ -262,7 +262,7 @@ def add_identifier(self, data: pd.DataFrame) -> None: # add cols to new data that are missing for col in df_cols: if col not in data.columns: - data[col] = [""] * len(data) + data.loc[:, col] = [""] * len(data) # re-order cols, first existing, then new df_col_set = set(df_cols) new_cols = [col for col in data.columns if col not in self.columns if col not in df_col_set] diff --git a/activity_browser/bwutils/searchengine/metadata_search.py b/activity_browser/bwutils/searchengine/metadata_search.py index 4c776235d..3e70a3cfd 100644 --- a/activity_browser/bwutils/searchengine/metadata_search.py +++ b/activity_browser/bwutils/searchengine/metadata_search.py @@ -23,17 +23,34 @@ def database_id_manager(self, database): self.all_database_ids[database] = self.database_ids else: self.database_ids = None + return self.database_ids def reset_database_id_manager(self): - del self.all_database_ids - del self.database_ids + if hasattr(self, "all_database_ids"): + del self.all_database_ids + if hasattr(self, "database_ids"): + del self.database_ids def add_identifier(self, data: pd.DataFrame) -> None: super().add_identifier(data) self.reset_database_id_manager() - def remove_identifier(self, identifier, logging=True) -> None: - super().remove_identifier(identifier, logging=logging) + + def remove_identifiers(self, identifiers, logging=True) -> None: + t = time() + + identifiers = set(identifiers) + current_identifiers = set(self.df.index.to_list()) + identifiers = identifiers | current_identifiers # only remove identifiers currently in the data + if len(identifiers) == 0: + return + + for identifier in identifiers: + super().remove_identifier(identifier, logging=False) + + if logging: + log.debug(f"Search index updated in {time() - t:.2f} seconds " + f"for {len(identifiers)} removed items ({len(self.df)} items ({self.size_of_index()}) currently).") self.reset_database_id_manager() def change_identifier(self, identifier, data: pd.DataFrame) -> None: diff --git a/tests/test_search.py b/tests/test_search.py index 727870359..0c40f4340 100644 --- a/tests/test_search.py +++ b/tests/test_search.py @@ -205,7 +205,6 @@ def test_search_remove_identifier(): assert se.search("find") == [] - def test_search_change_identifier(): """Do tests for changing identifier.""" df = data_for_test() From 0bd672c66a0f94d11b49ec7456e1846fdacaef2a Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Sat, 6 Sep 2025 16:29:52 +0200 Subject: [PATCH 31/47] add functionality for adding and removing full databases --- activity_browser/bwutils/metadata.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/activity_browser/bwutils/metadata.py b/activity_browser/bwutils/metadata.py index 04498e991..e8a1523ce 100644 --- a/activity_browser/bwutils/metadata.py +++ b/activity_browser/bwutils/metadata.py @@ -88,18 +88,17 @@ def on_node_deleted(self, ds): 
except KeyError: pass - def remove_identifier_from_search_engine(self, ds, reset_db_ids=True, logging=True): + def remove_identifier_from_search_engine(self, ds): data = model_to_dict(ds) identifier = data["id"] if identifier in self.search_engine.database_id_manager(data["database"]): - self.search_engine.remove_identifier(identifier, logging=logging) - if reset_db_ids: - self.search_engine.reset_database_id_manager() + self.search_engine.remove_identifier(identifier) + self.search_engine.reset_database_id_manager() def remove_identifiers_from_search_engine(self, identifiers): t = time() for identifier in identifiers: - self.remove_identifier_from_search_engine(identifier, reset_db_ids=False, logging=False) + self.search_engine.remove_identifier(identifier, logging=False) self.search_engine.reset_database_id_manager() log.debug(f"Search index updated in {time() - t:.2f} seconds " f"for {len(identifiers)} removed items " @@ -195,7 +194,10 @@ def sync_databases(self) -> None: for db_name in [x for x in self.databases if x not in bd.databases]: # deleted databases + remove_search_engine = self.dataframe[self.dataframe["database"] == db_name]["id"] self.dataframe.drop(db_name, level=0, inplace=True) + if len(remove_search_engine) > 0: + self.remove_identifiers_from_search_engine(remove_search_engine) sync = True for db_name in [x for x in bd.databases if x not in self.databases]: @@ -208,7 +210,7 @@ def sync_databases(self) -> None: self.dataframe = data else: self.dataframe = pd.concat([self.dataframe, data], join="outer") - + self.add_identifier_to_search_engine(data) sync = True if sync: From 4791c5647addbc212850a1c5346031fd7dd1eaca Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Sun, 7 Sep 2025 09:39:48 +0200 Subject: [PATCH 32/47] improve matching speed after metadata conversion to ProductModel --- activity_browser/bwutils/searchengine/base.py | 15 +++++++++++---- .../layouts/panes/database_products.py | 17 ++++++++--------- 2 files changed, 19 insertions(+), 13 deletions(-) diff --git a/activity_browser/bwutils/searchengine/base.py b/activity_browser/bwutils/searchengine/base.py index f0f34261b..4bcc3d45d 100644 --- a/activity_browser/bwutils/searchengine/base.py +++ b/activity_browser/bwutils/searchengine/base.py @@ -488,7 +488,7 @@ def find_q_gram_matches(self, q_grams: set) -> pd.DataFrame: return matches.iloc[:min(len(matches), 2500), :] # return at most this many results - def spell_check(self, text: str) -> OrderedDict: + def spell_check(self, text: str, skip_len=1) -> OrderedDict: """Create an OrderedDict of each word in the text (space separated) with as values possible alternatives. 
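The next hunk skips spell checking for very short words and scales the acceptable edit distance with word length, so short words only admit very close alternatives. A minimal sketch of that scaling rule (the helper name is illustrative; the patch inlines the expression):

```python
def scaled_cutoff(word: str, hard_limit: int = 4) -> int:
    """Roughly one accepted edit per three characters, capped at hard_limit."""
    return int(round(max(1, min(len(word) * 0.66, hard_limit)), 0))

assert scaled_cutoff("of") == 1           # 2 * 0.66 -> 1.32, floor of 1 applies
assert scaled_cutoff("coal") == 3         # 4 * 0.66 -> 2.64, rounds to 3
assert scaled_cutoff("production") == 4   # 10 * 0.66 -> 6.6, capped at hard_limit
```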
@@ -524,6 +524,13 @@ def spell_check(self, text: str) -> OrderedDict: words = [self.clean_text(word) for word in words] for word in words: + if len(word) <= skip_len: # dont look for alternatives for text this short + word_results[word] = [] + continue + + # reduce acceptable edit distance with short words + dont_accept = int(round(max(1, min((len(word) * 0.66), never_accept_this)), 0)) + # first, find possible matches quickly q_grams = self.text_to_positional_q_gram(word) possible_matches = self.find_q_gram_matches(set(q_grams)) @@ -534,13 +541,13 @@ def spell_check(self, text: str) -> OrderedDict: # now, refine with edit distance for row in possible_matches.itertuples(): - edit_distance = self.osa_distance(word, row[1], cutoff=never_accept_this) + edit_distance = self.osa_distance(word, row[1], cutoff=dont_accept) if edit_distance == 0: continue # we are looking for alternatives only, not the exact word elif edit_distance <= always_accept_this: first_matches[row[1]] = count_occurence(row[1]) - elif edit_distance < never_accept_this: + elif edit_distance < dont_accept: if not other_matches.get(edit_distance): other_matches[edit_distance] = Counter() other_matches[edit_distance][row[1]] = count_occurence(row[1]) @@ -551,7 +558,7 @@ def spell_check(self, text: str) -> OrderedDict: matches = [match for match, _ in first_matches.most_common()] # if we have fewer matches than goal, add more 'less good' matches if len(matches) < matches_min: - for i in range(always_accept_this + 1, never_accept_this): + for i in range(always_accept_this + 1, dont_accept): # iteratively increase matches with 'worse' results so we hit goal of minimum alternatives if new := other_matches.get(i): prev_num = 10e100 diff --git a/activity_browser/layouts/panes/database_products.py b/activity_browser/layouts/panes/database_products.py index 48de92aa1..680f4d2eb 100644 --- a/activity_browser/layouts/panes/database_products.py +++ b/activity_browser/layouts/panes/database_products.py @@ -493,19 +493,18 @@ def external_search(self, query): df = AB_metadata.dataframe[AB_metadata.dataframe["id"].isin(result_ids)].loc[:, ["id", "key"]] df = df.set_index("key", drop=True) translate_dict = df.to_dict()["id"] + result_keys = set(translate_dict.keys()) # convert the metadata id scores to row id scores row_scores = Counter() - df = self.dataframe.copy() - act_idx = set(df[df["activity_key"].isin(translate_dict.keys())].index.to_list()) - prd_idx = set(df[df["product_key"].isin(translate_dict.keys())].index.to_list()) - indices = act_idx | prd_idx # combine the two sets ('|' is a set union) - # iterate over the indices - for index in indices: - act_score = results.get(translate_dict.get(df.loc[index, "activity_key"]), 0) - prd_score = results.get(translate_dict.get(df.loc[index, "product_key"]), 0) - row_scores[index] = act_score + prd_score + match_df = self.dataframe[self.dataframe["activity_key"].isin(result_keys) | self.dataframe["product_key"].isin(result_keys)] + match_df = match_df.loc[:, ["activity_key", "product_key"]] + for row in match_df.itertuples(): + act_score = results.get(row[1], 0) + prd_score = results.get(row[2], 0) + row_scores[row[0]] = act_score + prd_score # finally only return the indices sorted_indices = [identifier[0] for identifier in row_scores.most_common()] + return sorted_indices From 532cac268a40e7d66f5ad2e11e7114424a99598c Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Sun, 7 Sep 2025 18:42:12 +0200 Subject: [PATCH 33/47] make autocomplete suggestions aware of context of other words in query, 
improving usefulness --- activity_browser/bwutils/searchengine/base.py | 2 +- .../bwutils/searchengine/metadata_search.py | 37 +++++++++++++++---- 2 files changed, 31 insertions(+), 8 deletions(-) diff --git a/activity_browser/bwutils/searchengine/base.py b/activity_browser/bwutils/searchengine/base.py index 4bcc3d45d..ca36e452a 100644 --- a/activity_browser/bwutils/searchengine/base.py +++ b/activity_browser/bwutils/searchengine/base.py @@ -753,7 +753,7 @@ def fuzzy_search(self, text: str, return_counter: bool = False) -> list: if return_counter: return all_identifiers # now sort on highest weights and make list type - sorted_identifiers = [identifier[0] for identifier in all_identifiers.most_common()] + sorted_identifiers = [identifier for identifier, _ in all_identifiers.most_common()] return sorted_identifiers def search(self, text) -> list: diff --git a/activity_browser/bwutils/searchengine/metadata_search.py b/activity_browser/bwutils/searchengine/metadata_search.py index 3e70a3cfd..8e55d27cc 100644 --- a/activity_browser/bwutils/searchengine/metadata_search.py +++ b/activity_browser/bwutils/searchengine/metadata_search.py @@ -57,20 +57,43 @@ def change_identifier(self, identifier, data: pd.DataFrame) -> None: super().change_identifier(identifier, data) self.reset_database_id_manager() - def auto_complete(self, word: str, database: Optional[str] = None) -> list: + def auto_complete(self, word: str, context: Optional[set] = set(), database: Optional[str] = None) -> list: """Based on spellchecker, make more useful for autocompletions """ - count_occurence = lambda x: sum(self.word_to_identifier[x].values()) # count occurences of a word + def word_to_identifier_to_word(check_word): + # assumes context words are correctly spelled + if len(context) == 0: + return 1 + multiplier = 1 + for identifier in self.word_to_identifier[check_word]: + for context_word in context: + for spell_checked_context_word in spell_checked_context[context_word]: + if spell_checked_context_word in self.identifier_to_word[identifier]: + multiplier += 1 + if context_word not in self.word_to_identifier.keys(): + continue + if context_word in self.identifier_to_word[identifier]: + multiplier += 3 + return multiplier + + # count occurrences of a word, count double so word_to_identifier_to_word will never multiply by 1 + count_occurrence = lambda x: sum(self.word_to_identifier[x].values()) * 2 + if len(word) <= 1: return [] self.database_id_manager(database) + if len(context) > 0: + spell_checked_context = {} + for context_word in context: + spell_checked_context[context_word] = self.spell_check(context_word)[context_word][:5] + matches_min = 2 # ideally we have at least this many alternatives matches_max = 4 # ideally don't much more than this many matches - never_accept_this = 5 # values this edit distance or over always rejected + never_accept_this = 4 # values this edit distance or over always rejected # or max 2/3 of len(word) if less than never_accept_this - never_accept_this = int(round(min(never_accept_this, max(1, len(word) * (2 / 3))), 0)) + never_accept_this = int(round(max(1, min((len(word) * 0.66), never_accept_this)), 0)) # first, find possible matches quickly q_grams = self.text_to_positional_q_gram(word) @@ -89,11 +112,11 @@ def auto_complete(self, word: str, database: Optional[str] = None) -> list: if len(row[1]) == 32 and edit_distance <= 1: probably_keys[row[1]] = 100 - edit_distance # keys need to be sorted on edit distance, not on occurence elif edit_distance == 0: - first_matches[row[1]] = 
count_occurence(row[1]) - elif edit_distance < never_accept_this: + first_matches[row[1]] = count_occurrence(row[1]) * word_to_identifier_to_word(row[1]) + elif edit_distance < never_accept_this and len(first_matches) < matches_min: if not other_matches.get(edit_distance): other_matches[edit_distance] = Counter() - other_matches[edit_distance][row[1]] = count_occurence(row[1]) + other_matches[edit_distance][row[1]] = count_occurrence(row[1]) * word_to_identifier_to_word(row[1]) else: continue From 42c359306a7251d4fc706fff112c75f26feb0844 Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Sun, 7 Sep 2025 18:56:41 +0200 Subject: [PATCH 34/47] ProductModel suggestions now include literal matches better --- activity_browser/bwutils/metadata.py | 12 +++---- .../bwutils/searchengine/metadata_search.py | 2 +- .../layouts/panes/database_products.py | 36 ++++++++++++++----- activity_browser/ui/widgets/line_edit.py | 3 +- 4 files changed, 37 insertions(+), 16 deletions(-) diff --git a/activity_browser/bwutils/metadata.py b/activity_browser/bwutils/metadata.py index e8a1523ce..32afb629b 100644 --- a/activity_browser/bwutils/metadata.py +++ b/activity_browser/bwutils/metadata.py @@ -395,19 +395,19 @@ def _unpacker(self, classifications: list, system: str) -> list: return system_classifications def init_search(self): - - self.search_engine = MetaDataSearchEngine(self.dataframe, identifier_name="id", searchable_columns=self.search_engine_whitelist) - def db_search(self, query:str, database: Optional[str] = None, return_counter: bool = False): - return self.search_engine.fuzzy_search(query, database=database, return_counter=return_counter) + def db_search(self, query:str, database: Optional[str] = None, return_counter: bool = False, logging: bool = True): + # we do fuzzy search as we re-index results (combining products and activities) for database_products table + # anyway, so including literal results quite literally is a waste of time at this point + return self.search_engine.fuzzy_search(query, database=database, return_counter=return_counter, logging=logging) def search(self, query:str): return self.search_engine.search(query) - def auto_complete(self, word:str, database: Optional[str] = None): + def auto_complete(self, word:str, context: Optional[set] = None, database: Optional[str] = None): word = self.search_engine.clean_text(word) - completions = self.search_engine.auto_complete(word, database) + completions = self.search_engine.auto_complete(word, context=context, database=database) return completions AB_metadata = MetaDataStore() diff --git a/activity_browser/bwutils/searchengine/metadata_search.py b/activity_browser/bwutils/searchengine/metadata_search.py index 8e55d27cc..ff580b88b 100644 --- a/activity_browser/bwutils/searchengine/metadata_search.py +++ b/activity_browser/bwutils/searchengine/metadata_search.py @@ -87,7 +87,7 @@ def word_to_identifier_to_word(check_word): if len(context) > 0: spell_checked_context = {} for context_word in context: - spell_checked_context[context_word] = self.spell_check(context_word)[context_word][:5] + spell_checked_context[context_word] = self.spell_check(context_word).get(context_word, [])[:5] matches_min = 2 # ideally we have at least this many alternatives matches_max = 4 # ideally don't much more than this many matches diff --git a/activity_browser/layouts/panes/database_products.py b/activity_browser/layouts/panes/database_products.py index 680f4d2eb..caf83aace 100644 --- a/activity_browser/layouts/panes/database_products.py +++ 
b/activity_browser/layouts/panes/database_products.py @@ -485,7 +485,9 @@ def values_from_indices(key: str, indices: list[QtCore.QModelIndex]): return values def external_search(self, query): - results = AB_metadata.db_search(query, database=self.external_col_name, return_counter=True) + t = time() + results = AB_metadata.db_search(query, database=self.external_col_name, return_counter=True, logging=False) + t2 = time() # extract a dict with 'key' as key and 'id' as values from the metadata result_ids = set(results.keys()) @@ -496,15 +498,33 @@ def external_search(self, query): result_keys = set(translate_dict.keys()) # convert the metadata id scores to row id scores - row_scores = Counter() + best_row_scores = Counter() + remain_row_scores = Counter() match_df = self.dataframe[self.dataframe["activity_key"].isin(result_keys) | self.dataframe["product_key"].isin(result_keys)] - match_df = match_df.loc[:, ["activity_key", "product_key"]] + cols = ["activity_key", "product_key"] + cols = cols + [col for col in match_df.columns if col not in cols] + match_df = match_df.loc[:, cols] for row in match_df.itertuples(): - act_score = results.get(row[1], 0) - prd_score = results.get(row[2], 0) - row_scores[row[0]] = act_score + prd_score + # score higher if exact words occur + act_score = results.get(translate_dict.get(row[1]), 0) + prd_score = results.get(translate_dict.get(row[2]), 0) + row_text = str(row[1:]) + for query_word in query.split(" "): + if amt := query.count(query_word) > 0 and len(query_word) > 0: + best_row_scores[row[0]] = (act_score + prd_score) * amt + if query in row_text: + score = (best_row_scores.get(row[0], 0) + act_score + prd_score) * 2 + best_row_scores[row[0]] = score + else: + remain_row_scores[row[0]] = act_score + prd_score # finally only return the indices - sorted_indices = [identifier[0] for identifier in row_scores.most_common()] - + best_sorted_indices = [identifier for identifier, _ in best_row_scores.most_common()] + remain_sorted_indices = [identifier for identifier, _ in remain_row_scores.most_common()] + sorted_indices = best_sorted_indices + remain_sorted_indices + log.debug( + f"ProductModel search in '{self.external_col_name}' ({len(self.dataframe)} items) " + f"found {len(sorted_indices)} ({len(best_sorted_indices)} literal) results " + f"for '{query}' in {time() - t:.2f} seconds ({t2 - t:.2f}s actual search, {time() - t2:.2f}s reorder for table)" + ) return sorted_indices diff --git a/activity_browser/ui/widgets/line_edit.py b/activity_browser/ui/widgets/line_edit.py index 9545c5943..356b5707e 100644 --- a/activity_browser/ui/widgets/line_edit.py +++ b/activity_browser/ui/widgets/line_edit.py @@ -160,9 +160,10 @@ def _set_items(self, text=None): if not current_word: self.model.setStringList([]) return + context = set((text[:start] + text[end:]).split(" ")) # get suggestions for the current word - alternatives = AB_metadata.auto_complete(current_word, database=self.database_name) + alternatives = AB_metadata.auto_complete(current_word, context=context, database=self.database_name) alternatives = alternatives[:6] # at most 6, though we should get ~3 usually # replace the current word with each alternative items = [] From 4ec98fb890ce9d484264c0014db05977efb4afae Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Mon, 8 Sep 2025 11:56:04 +0200 Subject: [PATCH 35/47] Update line-edit autocompleter base class --- activity_browser/ui/widgets/line_edit.py | 118 +++++++++++++++++++++++ 1 file changed, 118 insertions(+) diff --git 
a/activity_browser/ui/widgets/line_edit.py b/activity_browser/ui/widgets/line_edit.py index 356b5707e..11fd8b793 100644 --- a/activity_browser/ui/widgets/line_edit.py +++ b/activity_browser/ui/widgets/line_edit.py @@ -180,3 +180,121 @@ def _set_items(self, text=None): ) self.popup.setMaximumHeight(max_height) +class ABTextEdit(QtWidgets.QTextEdit): + textChangedDebounce: SignalInstance = Signal(str) + _debounce_ms = 250 + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + self._debounce_timer = QTimer(self, singleShot=True) + + self.textChanged.connect(self._set_debounce) + self._debounce_timer.timeout.connect(self._emit_debounce) + + def _set_debounce(self): + self._debounce_timer.setInterval(self._debounce_ms) + self._debounce_timer.start() + + def _emit_debounce(self): + self.textChangedDebounce.emit(self.toPlainText()) + + def debounce(self): + return self._debounce_ms + + def setDebounce(self, ms: int): + self._debounce_ms = ms + + +class MetaDataAutoCompleteLineEdit(ABTextEdit): + """Line Edit with MetaDataStore completer attached""" + + def __init__(self, parent=None): + super().__init__(parent=parent) + self.database_name = "" + + # autocompleter settings + self.model = QStringListModel() + self.completer = QCompleter(self.model) + self.completer.setWidget(self) + self.popup = self.completer.popup() + self.popup.setHorizontalScrollBarPolicy(Qt.ScrollBarAsNeeded) + self.completer.setPopup(self.popup) + # allow all items in popup list + self.completer.setCompletionMode(QCompleter.UnfilteredPopupCompletion) + self.completer.activated.connect(self._insert_auto_complete) + + self.textChanged.connect(self.sanitize_input) + + def sanitize_input(self): + text = self.toPlainText() + text = AB_metadata.search_engine.ONE_SPACE_PATTERN.sub(" ", text) + self.blockSignals(True) + self.clear() + self.insertPlainText(text) + self.blockSignals(False) + if len(text) == 0: + self.popup.close() + + def _insert_auto_complete(self, completion): + self.clear() + self.insertPlainText(completion) + self.popup.close() + self._set_items() + + def _set_items(self): + text = self.toPlainText() + + # find the start and end of the word under the cursor + cursor_pos = self.textCursor().position() + start = cursor_pos + while start > 0 and text[start - 1] != " ": + start -= 1 + end = cursor_pos + while end < len(text) and text[end] != " ": + end += 1 + current_word = text[start:end] + if not current_word: + self.model.setStringList([]) + return + context = set((text[:start] + text[end:]).split(" ")) + + # get suggestions for the current word + alternatives = AB_metadata.auto_complete(current_word, context=context, database=self.database_name) + alternatives = alternatives[:6] # at most 6, though we should get ~3 usually + # replace the current word with each alternative + items = [] + for alt in alternatives: + new_text = text[:start] + alt + text[end:] + items.append(new_text) + print(text, items) + if len(items) == 0: + return + + self.model.setStringList(items) + # set correct height now that we have data + max_height = max( + 20, + self.popup.sizeHintForRow(0) * 3 + 2 * self.popup.frameWidth() + ) + self.popup.setMaximumHeight(max_height) + self.completer.complete() + + def keyPressEvent(self, event): + key = event.key() + + if key in (Qt.Key_Enter, Qt.Key_Return, Qt.Key_Tab): + # insert an autocomplete item + # capture enter/return/tab key + index = self.popup.currentIndex() + selected_text = index.data(Qt.DisplayRole) + self.completer.activated.emit(selected_text + " ") + return + 
elif key in (Qt.Key_Space,): + self.popup.close() + + super().keyPressEvent(event) + + # trigger on text input keys + if event.text(): # filters out non-text keys like arrows, shift, etc. + self._set_items() From 72e01d1850d446aa879b5f29a062e86b36ed5bbe Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Tue, 9 Sep 2025 10:25:10 +0200 Subject: [PATCH 36/47] Add marking of unknown words to search --- activity_browser/ui/widgets/line_edit.py | 99 ++++++++++++++++++++---- 1 file changed, 83 insertions(+), 16 deletions(-) diff --git a/activity_browser/ui/widgets/line_edit.py b/activity_browser/ui/widgets/line_edit.py index 11fd8b793..7095a5f88 100644 --- a/activity_browser/ui/widgets/line_edit.py +++ b/activity_browser/ui/widgets/line_edit.py @@ -1,6 +1,6 @@ from qtpy import QtWidgets from qtpy.QtCore import QTimer, Slot, Signal, SignalInstance, QStringListModel, Qt -from qtpy.QtGui import QTextFormat +from qtpy.QtGui import QTextFormat, QSyntaxHighlighter, QTextCharFormat, QTextDocument, QTextCursor from qtpy.QtWidgets import QCompleter from activity_browser.bwutils import AB_metadata @@ -180,6 +180,29 @@ def _set_items(self, text=None): ) self.popup.setMaximumHeight(max_height) + +class UnknownWordHighlighter(QSyntaxHighlighter): + def __init__(self, parent: QTextDocument, known_words: set): + super().__init__(parent) + self.known_words = known_words + + # define the format for unknown words + self.unknown_format = QTextCharFormat() + self.unknown_format.setUnderlineStyle(QTextCharFormat.SpellCheckUnderline) + self.unknown_format.setUnderlineColor(Qt.red) + + def highlightBlock(self, text: str): + if text.startswith("="): + return + words = text.split() + index = 0 + for word in words: + word_len = len(word) + if word and word not in self.known_words: + self.setFormat(index, word_len, self.unknown_format) + index += word_len + 1 # +1 for the space + + class ABTextEdit(QtWidgets.QTextEdit): textChangedDebounce: SignalInstance = Signal(str) _debounce_ms = 250 @@ -212,6 +235,7 @@ class MetaDataAutoCompleteLineEdit(ABTextEdit): def __init__(self, parent=None): super().__init__(parent=parent) self.database_name = "" + self.auto_complete_word = "" # autocompleter settings self.model = QStringListModel() @@ -225,27 +249,64 @@ def __init__(self, parent=None): self.completer.activated.connect(self._insert_auto_complete) self.textChanged.connect(self.sanitize_input) + self.highlighter = UnknownWordHighlighter(self.document(), set()) + self.cursorPositionChanged.connect(self._set_items) def sanitize_input(self): + self._debounce_timer.stop() text = self.toPlainText() - text = AB_metadata.search_engine.ONE_SPACE_PATTERN.sub(" ", text) - self.blockSignals(True) - self.clear() - self.insertPlainText(text) - self.blockSignals(False) + clean_text = AB_metadata.search_engine.ONE_SPACE_PATTERN.sub(" ", text) + + if clean_text != text: + cursor = self.textCursor() + position = cursor.position() + self.blockSignals(True) + self.clear() + self.insertPlainText(clean_text) + self.blockSignals(False) + cursor.setPosition(min(position, len(text))) + self.setTextCursor(cursor) + + known_words = set() + for identifier in AB_metadata.search_engine.database_id_manager(self.database_name): + known_words.update(AB_metadata.search_engine.identifier_to_word[identifier].keys()) + self.highlighter.known_words = known_words + if len(text) == 0: self.popup.close() + self._set_debounce() def _insert_auto_complete(self, completion): - self.clear() - self.insertPlainText(completion) + cursor = self.textCursor() + position = 
cursor.position() + text = self.toPlainText() + + start = position + while start > 0 and text[start - 1] != " ": + start -= 1 + new_position = start + len(completion) + 1 + + # select the word under the cursor + cursor.select(QTextCursor.WordUnderCursor) + # replace it with the completion + cursor.insertText(completion + " ") + # set the updated cursor to end of inserted word + space + cursor.setPosition(min(new_position, len(text[:start] + completion) + 1)) + self.setTextCursor(cursor) + self.popup.close() - self._set_items() + self.auto_complete_word = "" + self.model.setStringList([]) def _set_items(self): text = self.toPlainText() + if text.startswith("="): + self.model.setStringList([]) + self.auto_complete_word = "" + self.popup.close() + return - # find the start and end of the word under the cursor + # find the start and end of the word under the cursor cursor_pos = self.textCursor().position() start = cursor_pos while start > 0 and text[start - 1] != " ": @@ -257,8 +318,12 @@ def _set_items(self): if not current_word: self.model.setStringList([]) return - context = set((text[:start] + text[end:]).split(" ")) + if self.auto_complete_word == current_word: + # avoid unnecessary auto_complete calls if the current word didnt change + return + self.auto_complete_word = current_word + context = set((text[:start] + text[end:]).split(" ")) # get suggestions for the current word alternatives = AB_metadata.auto_complete(current_word, context=context, database=self.database_name) alternatives = alternatives[:6] # at most 6, though we should get ~3 usually @@ -266,9 +331,11 @@ def _set_items(self): items = [] for alt in alternatives: new_text = text[:start] + alt + text[end:] - items.append(new_text) - print(text, items) + # items.append(new_text) + items.append(alt) + print(cursor_pos, text, items) if len(items) == 0: + self.popup.close() return self.model.setStringList(items) @@ -287,8 +354,8 @@ def keyPressEvent(self, event): # insert an autocomplete item # capture enter/return/tab key index = self.popup.currentIndex() - selected_text = index.data(Qt.DisplayRole) - self.completer.activated.emit(selected_text + " ") + completion_text = index.data(Qt.DisplayRole) + self.completer.activated.emit(completion_text) return elif key in (Qt.Key_Space,): self.popup.close() @@ -296,5 +363,5 @@ def keyPressEvent(self, event): super().keyPressEvent(event) # trigger on text input keys - if event.text(): # filters out non-text keys like arrows, shift, etc. + if event.text() or key in (Qt.LeftArrow, Qt.RightArrow): # filters out non-text keys like arrows, shift, etc. 
self._set_items() From fbeb4554bd7f304be61ebb50d21671d648497687 Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Tue, 9 Sep 2025 10:25:37 +0200 Subject: [PATCH 37/47] drop literal search results --- activity_browser/bwutils/searchengine/base.py | 13 ++++++------ .../bwutils/searchengine/metadata_search.py | 12 +++++------ .../layouts/panes/database_products.py | 21 ++++--------------- 3 files changed, 15 insertions(+), 31 deletions(-) diff --git a/activity_browser/bwutils/searchengine/base.py b/activity_browser/bwutils/searchengine/base.py index ca36e452a..3d3ffe18c 100644 --- a/activity_browser/bwutils/searchengine/base.py +++ b/activity_browser/bwutils/searchengine/base.py @@ -516,13 +516,13 @@ def spell_check(self, text: str, skip_len=1) -> OrderedDict: never_accept_this = 4 # values this edit distance or over always rejected # make list of unique words + text = self.clean_text(text) words = OrderedDict() for word in text.split(" "): - words[word] = False + if len(word) != 0: + words[word] = False words = words.keys() - words = [self.clean_text(word) for word in words] - for word in words: if len(word) <= skip_len: # dont look for alternatives for text this short word_results[word] = [] @@ -703,10 +703,9 @@ def fuzzy_search(self, text: str, return_counter: bool = False) -> list: # finally, make this a Counter (with each item=1) so we can properly weigh things later query_id_sets = [set(query_to_identifier.get(q_word)) for q_word in query if query_to_identifier.get(q_word, False)] - if len(query_id_sets) > 0: - query_identifier_set = set.intersection(*query_id_sets) - else: - query_identifier_set = set() + if len(query_id_sets) == 0: + continue + query_identifier_set = set.intersection(*query_id_sets) if len(query_identifier_set) == 0: # there is no match for this combination of query words, skip break diff --git a/activity_browser/bwutils/searchengine/metadata_search.py b/activity_browser/bwutils/searchengine/metadata_search.py index ff580b88b..374ca56e0 100644 --- a/activity_browser/bwutils/searchengine/metadata_search.py +++ b/activity_browser/bwutils/searchengine/metadata_search.py @@ -61,7 +61,6 @@ def auto_complete(self, word: str, context: Optional[set] = set(), database: Opt """Based on spellchecker, make more useful for autocompletions """ def word_to_identifier_to_word(check_word): - # assumes context words are correctly spelled if len(context) == 0: return 1 multiplier = 1 @@ -73,7 +72,7 @@ def word_to_identifier_to_word(check_word): if context_word not in self.word_to_identifier.keys(): continue if context_word in self.identifier_to_word[identifier]: - multiplier += 3 + multiplier += 4 return multiplier # count occurrences of a word, count double so word_to_identifier_to_word will never multiply by 1 @@ -105,7 +104,7 @@ def word_to_identifier_to_word(check_word): # now, refine with edit distance for row in possible_matches.itertuples(): - if len(word) > len(row[1]) or word == row[1]: + if word == row[1]: continue # find edit distance of same size strings edit_distance = self.osa_distance(word, row[1][:len(word)], cutoff=never_accept_this) @@ -253,10 +252,9 @@ def fuzzy_search(self, text: str, database: Optional[str] = None, return_counter # finally, make this a Counter (with each item=1) so we can properly weigh things later query_id_sets = [set(query_to_identifier.get(q_word)) for q_word in query if query_to_identifier.get(q_word, False)] - if len(query_id_sets) > 0: - query_identifier_set = set.intersection(*query_id_sets) - else: - query_identifier_set = set() + if 
len(query_id_sets) == 0: + continue + query_identifier_set = set.intersection(*query_id_sets) if len(query_identifier_set) == 0: # there is no match for this combination of query words, skip break diff --git a/activity_browser/layouts/panes/database_products.py b/activity_browser/layouts/panes/database_products.py index caf83aace..86228490a 100644 --- a/activity_browser/layouts/panes/database_products.py +++ b/activity_browser/layouts/panes/database_products.py @@ -498,33 +498,20 @@ def external_search(self, query): result_keys = set(translate_dict.keys()) # convert the metadata id scores to row id scores - best_row_scores = Counter() - remain_row_scores = Counter() + row_scores = Counter() match_df = self.dataframe[self.dataframe["activity_key"].isin(result_keys) | self.dataframe["product_key"].isin(result_keys)] cols = ["activity_key", "product_key"] - cols = cols + [col for col in match_df.columns if col not in cols] match_df = match_df.loc[:, cols] for row in match_df.itertuples(): - # score higher if exact words occur act_score = results.get(translate_dict.get(row[1]), 0) prd_score = results.get(translate_dict.get(row[2]), 0) - row_text = str(row[1:]) - for query_word in query.split(" "): - if amt := query.count(query_word) > 0 and len(query_word) > 0: - best_row_scores[row[0]] = (act_score + prd_score) * amt - if query in row_text: - score = (best_row_scores.get(row[0], 0) + act_score + prd_score) * 2 - best_row_scores[row[0]] = score - else: - remain_row_scores[row[0]] = act_score + prd_score + row_scores[row[0]] = act_score + prd_score # finally only return the indices - best_sorted_indices = [identifier for identifier, _ in best_row_scores.most_common()] - remain_sorted_indices = [identifier for identifier, _ in remain_row_scores.most_common()] - sorted_indices = best_sorted_indices + remain_sorted_indices + sorted_indices = [identifier for identifier, _ in row_scores.most_common()] log.debug( f"ProductModel search in '{self.external_col_name}' ({len(self.dataframe)} items) " - f"found {len(sorted_indices)} ({len(best_sorted_indices)} literal) results " + f"found {len(sorted_indices)} results " f"for '{query}' in {time() - t:.2f} seconds ({t2 - t:.2f}s actual search, {time() - t2:.2f}s reorder for table)" ) return sorted_indices From 59e8e188066011894438dcf99e808cacd755af89 Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Tue, 9 Sep 2025 12:23:40 +0200 Subject: [PATCH 38/47] marginal speed increases for initializing/updating for base class --- activity_browser/bwutils/searchengine/base.py | 54 ++++++++++--------- 1 file changed, 29 insertions(+), 25 deletions(-) diff --git a/activity_browser/bwutils/searchengine/base.py b/activity_browser/bwutils/searchengine/base.py index 3d3ffe18c..a6292c874 100644 --- a/activity_browser/bwutils/searchengine/base.py +++ b/activity_browser/bwutils/searchengine/base.py @@ -1,7 +1,7 @@ from itertools import permutations, chain import itertools import functools -from collections import Counter, OrderedDict +from collections import Counter, OrderedDict, defaultdict from logging import getLogger from time import time from typing import Iterable, Optional @@ -99,11 +99,17 @@ def update_index(self, update_df: pd.DataFrame) -> None: def update_dict(update_me: dict, new: dict) -> dict: """Update a dict of counters with new dict of counters.""" - for dict_key, _counter in new.items(): - if dict_key in update_me: - update_me[dict_key].update(_counter) - else: - update_me[dict_key] = _counter + # set to empty set if we know update_me is empty, otherwise, find 
set intersection + update_keys = set() if len(update_me) == 0 else new.keys() & update_me.keys() + if len(update_keys) == 0: + new_data = new + else: + for update_key in update_keys: + update_me[update_key].update(new[update_key]) + new_data = {key: value for key, value in new.items() if key not in update_keys} + # finally add any completely new data + # update_me.update(new_data) + update_me = update_me | new_data return update_me t = time() @@ -112,8 +118,10 @@ def update_dict(update_me: dict, new: dict) -> dict: # identifier to word and df i2w, update_df = self.words_in_df(update_df) self.identifier_to_word = update_dict(self.identifier_to_word, i2w) + for col in [col for col in update_df.columns if col not in self.df]: + col_data = [""] * len(self.df) + self.df[col] = col_data self.df = pd.concat([self.df, update_df]) - self.df = self.df.fillna("") # ensure we don't add unwanted NA through concatenations # word to identifier w2i = self.reverse_dict_many_to_one(i2w) @@ -126,7 +134,6 @@ def update_dict(update_me: dict, new: dict) -> dict: # q-gram to word q2w = self.reverse_dict_many_to_one(w2q) self.q_gram_to_word = update_dict(self.q_gram_to_word, q2w) - size_new = len(self.df) size_dif = size_new - size_old size_msg = (f"{size_dif} changed items at {int(round(size_dif/(time() - t), 0))} items/sec " @@ -153,13 +160,12 @@ def text_to_positional_q_gram(self, text: str) -> list: Note: these are technically _positional_ q-grams, but we don't use their positions currently. """ q = self.q - + n = len(text) # just return a single-item list if the text is equal or shorter than q # else, generate q-grams - if len(text) <= q: + if n <= q: return [text] - else: - return [text[i:i + q] for i in range(len(text) - q + 1)] + return list(text[i:i + q] for i in range(n - q + 1)) def words_in_df(self, df: pd.DataFrame = None) -> tuple[dict, pd.DataFrame]: """Return a dict of {identifier: word} for df.""" @@ -176,39 +182,37 @@ def words_in_df(self, df: pd.DataFrame = None) -> tuple[dict, pd.DataFrame]: col.append(line) identifier_word_dict[row[0]] = Counter(line.split(" ")) return_df["query_col"] = col + return_df = return_df.fillna("") # ensure we don't add unwanted NA in new data return identifier_word_dict, return_df def reverse_dict_many_to_one(self, dictionary: dict) -> dict: """Reverse a dictionary of Counter objects.""" - reverse = {} + reverse = defaultdict(Counter) for identifier, counter_object in dictionary.items(): for countable, count in counter_object.items(): - if countable not in reverse: - reverse[countable] = Counter() reverse[countable][identifier] += count - return reverse + return dict(reverse) def list_to_q_grams(self, word_list: Iterable) -> dict: """Convert a list of unique words to a dict with Counter objects. Number will be the occurrences of that q-gram in that word. - q_gram_dict = { + return = { "word": Counter( "wo": 1 "or": 1 "rd": 1 - ) + ), + ... 
} - """ - q_gram_dict = {} - - for word in word_list: - q_gram_dict[word] = Counter(self.text_to_positional_q_gram(word)) - - return q_gram_dict + text_to_q_gram = self.text_to_positional_q_gram + return { + word: Counter(text_to_q_gram(word)) + for word in word_list + } def word_in_index(self, word: str) -> bool: """Convenience function to check if a single word is in the search index.""" From e04c20e2dfc21699b2c959e15ee2226797b77831 Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Tue, 9 Sep 2025 14:24:33 +0200 Subject: [PATCH 39/47] marginal speed increases for initializing/updating for base class --- activity_browser/bwutils/searchengine/base.py | 27 +++++++++---------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/activity_browser/bwutils/searchengine/base.py b/activity_browser/bwutils/searchengine/base.py index a6292c874..91cc64e12 100644 --- a/activity_browser/bwutils/searchengine/base.py +++ b/activity_browser/bwutils/searchengine/base.py @@ -116,12 +116,15 @@ def update_dict(update_me: dict, new: dict) -> dict: size_old = len(self.df) # identifier to word and df + t2 = time() i2w, update_df = self.words_in_df(update_df) + log.debug(f">>> DF {time() - t2:.2f}.") self.identifier_to_word = update_dict(self.identifier_to_word, i2w) for col in [col for col in update_df.columns if col not in self.df]: col_data = [""] * len(self.df) self.df[col] = col_data self.df = pd.concat([self.df, update_df]) + log.debug(f">>> tot {time() - t2:.2f}.") # word to identifier w2i = self.reverse_dict_many_to_one(i2w) @@ -170,21 +173,15 @@ def text_to_positional_q_gram(self, text: str) -> list: def words_in_df(self, df: pd.DataFrame = None) -> tuple[dict, pd.DataFrame]: """Return a dict of {identifier: word} for df.""" - df = df if any(df) else self.df - return_df = df.copy() - - df = df.iloc[:, self.searchable_columns] - identifier_word_dict = {} - col = [] - - for row in df.itertuples(index=True): - line = self.clean_text(" | ".join(row[1:])) - col.append(line) - identifier_word_dict[row[0]] = Counter(line.split(" ")) - return_df["query_col"] = col - return_df = return_df.fillna("") # ensure we don't add unwanted NA in new data - - return identifier_word_dict, return_df + df = df if df is not None else self.df.copy() + df = df.fillna("") # avoid nan + # assemble query_col + df["query_col"] = df.iloc[:, self.searchable_columns].astype(str).agg(" | ".join, axis=1) + # clean all text at once using vectorized operations + df["query_col"] = df["query_col"].apply(self.clean_text) + # build the identifier_word_dict dictionary + identifier_word_dict = df["query_col"].apply(lambda text: Counter(text.split(" "))).to_dict() + return identifier_word_dict, df def reverse_dict_many_to_one(self, dictionary: dict) -> dict: """Reverse a dictionary of Counter objects.""" From 1bedc53ff507a8952702736d2688569ac87dc3e9 Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Tue, 9 Sep 2025 17:12:09 +0200 Subject: [PATCH 40/47] Implement multiprocessing to increase speed for text cleaning during indexing. 
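Cleaning of the assembled query column is chunked and spread over a worker pool when the DataFrame is large enough to amortise process start-up; small inputs keep the single-process path. A simplified, self-contained sketch of the pattern (the cleaning body and threshold are placeholders, not the exact helpers added below):

```python
import math
import multiprocessing as mp

import pandas as pd


def clean_chunk(chunk: pd.DataFrame) -> pd.DataFrame:
    # placeholder for the per-chunk text cleaning applied to the query column
    return chunk.assign(query_col=chunk["query_col"].str.lower().str.strip())


def clean_parallel(df: pd.DataFrame, min_chunk_size: int = 2500) -> pd.DataFrame:
    """Clean in parallel only when chunks stay above min_chunk_size rows."""
    workers = max(1, mp.cpu_count() - 1)  # leave one core for the rest of the app
    if workers == 1 or len(df) < min_chunk_size * 2:
        return clean_chunk(df)
    chunk_size = int(math.ceil(len(df) / workers))
    chunks = [df.iloc[i:i + chunk_size] for i in range(0, len(df), chunk_size)]
    with mp.Pool(processes=workers) as pool:
        return pd.concat(pool.map(clean_chunk, chunks))
```

On platforms that spawn rather than fork (Windows, macOS), the worker must be importable at module level and the call should sit behind an `if __name__ == "__main__":` guard.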
--- activity_browser/bwutils/searchengine/base.py | 46 ++++++++++++++----- 1 file changed, 35 insertions(+), 11 deletions(-) diff --git a/activity_browser/bwutils/searchengine/base.py b/activity_browser/bwutils/searchengine/base.py index 91cc64e12..5a7752e3a 100644 --- a/activity_browser/bwutils/searchengine/base.py +++ b/activity_browser/bwutils/searchengine/base.py @@ -3,6 +3,8 @@ import functools from collections import Counter, OrderedDict, defaultdict from logging import getLogger +import math +import multiprocessing as mp from time import time from typing import Iterable, Optional import pandas as pd @@ -114,26 +116,16 @@ def update_dict(update_me: dict, new: dict) -> dict: t = time() size_old = len(self.df) - # identifier to word and df - t2 = time() i2w, update_df = self.words_in_df(update_df) - log.debug(f">>> DF {time() - t2:.2f}.") self.identifier_to_word = update_dict(self.identifier_to_word, i2w) - for col in [col for col in update_df.columns if col not in self.df]: - col_data = [""] * len(self.df) - self.df[col] = col_data self.df = pd.concat([self.df, update_df]) - log.debug(f">>> tot {time() - t2:.2f}.") - # word to identifier w2i = self.reverse_dict_many_to_one(i2w) self.word_to_identifier = update_dict(self.word_to_identifier, w2i) - # word to q-gram w2q = self.list_to_q_grams(w2i.keys()) self.word_to_q_grams = update_dict(self.word_to_q_grams, w2q) - # q-gram to word q2w = self.reverse_dict_many_to_one(w2q) self.q_gram_to_word = update_dict(self.q_gram_to_word, q2w) @@ -170,6 +162,38 @@ def text_to_positional_q_gram(self, text: str) -> list: return [text] return list(text[i:i + q] for i in range(n - q + 1)) + def df_clean_worker(self, df): + """Clean the text in query_col.""" + df["query_col"] = df["query_col"].apply(self.clean_text) + return df + + def df_clean(self, df): + """Clean the text in query_col. 
+ + apply multi-processing when the computer is able and its relevant + """ + def chunk_dataframe(df: pd.DataFrame, chunk_size: int): + """Split DataFrame into chunks of specified size.""" + return [df.iloc[i:i + chunk_size] for i in range(0, len(df), chunk_size)] + + max_cores = max(1, mp.cpu_count() - 1) # leave at least 1 core for other processes + min_chunk_size = 2500 + if max_cores > 1 and len(df) > min_chunk_size * 2: + for i in range(max_cores, 0, -1): + chunk_size = int(math.ceil(len(df) / i)) + if chunk_size >= min_chunk_size: + break + use_cores = i + else: + use_cores = 1 + if use_cores == 1: + return self.df_clean_worker(df) + + chunks = chunk_dataframe(df, chunk_size) + with mp.Pool(processes=use_cores) as pool: + results = pool.starmap(self.df_clean_worker, [(chunk,) for chunk in chunks]) + return pd.concat(results) + def words_in_df(self, df: pd.DataFrame = None) -> tuple[dict, pd.DataFrame]: """Return a dict of {identifier: word} for df.""" @@ -178,7 +202,7 @@ def words_in_df(self, df: pd.DataFrame = None) -> tuple[dict, pd.DataFrame]: # assemble query_col df["query_col"] = df.iloc[:, self.searchable_columns].astype(str).agg(" | ".join, axis=1) # clean all text at once using vectorized operations - df["query_col"] = df["query_col"].apply(self.clean_text) + df["query_col"] = self.df_clean(df.loc[:, ["query_col"]]) # build the identifier_word_dict dictionary identifier_word_dict = df["query_col"].apply(lambda text: Counter(text.split(" "))).to_dict() return identifier_word_dict, df From 169a7cbe30690c004f179e76f50b9eb8dae5ea66 Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Tue, 9 Sep 2025 17:12:39 +0200 Subject: [PATCH 41/47] Fix bug with incorrect text length settings --- activity_browser/ui/widgets/line_edit.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/activity_browser/ui/widgets/line_edit.py b/activity_browser/ui/widgets/line_edit.py index 7095a5f88..9414fa878 100644 --- a/activity_browser/ui/widgets/line_edit.py +++ b/activity_browser/ui/widgets/line_edit.py @@ -264,7 +264,7 @@ def sanitize_input(self): self.clear() self.insertPlainText(clean_text) self.blockSignals(False) - cursor.setPosition(min(position, len(text))) + cursor.setPosition(min(position, len(clean_text))) self.setTextCursor(cursor) known_words = set() @@ -317,6 +317,7 @@ def _set_items(self): current_word = text[start:end] if not current_word: self.model.setStringList([]) + self.popup.close() return if self.auto_complete_word == current_word: # avoid unnecessary auto_complete calls if the current word didnt change From 7efab029f392ec3fa81f8207b03162ad27048080 Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Tue, 9 Sep 2025 18:25:16 +0200 Subject: [PATCH 42/47] Fix to allow testing of metadatastore --- activity_browser/bwutils/metadata.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/activity_browser/bwutils/metadata.py b/activity_browser/bwutils/metadata.py index 32afb629b..6f96814fa 100644 --- a/activity_browser/bwutils/metadata.py +++ b/activity_browser/bwutils/metadata.py @@ -89,6 +89,8 @@ def on_node_deleted(self, ds): pass def remove_identifier_from_search_engine(self, ds): + if not hasattr(self, "search_engine"): + return data = model_to_dict(ds) identifier = data["id"] if identifier in self.search_engine.database_id_manager(data["database"]): @@ -96,6 +98,8 @@ def remove_identifier_from_search_engine(self, ds): self.search_engine.reset_database_id_manager() def remove_identifiers_from_search_engine(self, identifiers): + if not hasattr(self, "search_engine"): + 
return t = time() for identifier in identifiers: self.search_engine.remove_identifier(identifier, logging=False) @@ -132,12 +136,16 @@ def on_node_changed(self, new, old): self.thread().eventDispatcher().awake.connect(self._emitSyncLater, Qt.ConnectionType.UniqueConnection) def add_identifier_to_search_engine(self, data: pd.DataFrame): + if not hasattr(self, "search_engine"): + return search_engine_cols = list(set(data.columns) & set(self.search_engine_whitelist)) # intersection becomes columns data = data[search_engine_cols] self.search_engine.add_identifier(data.copy()) self.search_engine.reset_database_id_manager() def change_identifier_in_search_engine(self, identifier, data: pd.DataFrame): + if not hasattr(self, "search_engine"): + return search_engine_cols = list(set(data.columns) & set(self.search_engine_whitelist)) # intersection becomes columns data = data[search_engine_cols] self.search_engine.change_identifier(identifier=identifier, data=data.copy()) From 06747b839c94ac78a11c0d847a028c5c9562c92f Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Fri, 12 Sep 2025 17:02:08 +0200 Subject: [PATCH 43/47] Refactor textedit to proper location --- .../layouts/panes/database_products.py | 2 +- activity_browser/ui/widgets/line_edit.py | 260 +----------------- activity_browser/ui/widgets/text_edit.py | 251 +++++++++++++++++ 3 files changed, 254 insertions(+), 259 deletions(-) create mode 100644 activity_browser/ui/widgets/text_edit.py diff --git a/activity_browser/layouts/panes/database_products.py b/activity_browser/layouts/panes/database_products.py index 86228490a..475824266 100644 --- a/activity_browser/layouts/panes/database_products.py +++ b/activity_browser/layouts/panes/database_products.py @@ -60,7 +60,7 @@ def __init__(self, parent, db_name: str): self.model.has_external_search = True self.model.external_col_name = db_name - self.search = widgets.MetaDataAutoCompleteLineEdit(self) + self.search = widgets.MetaDataAutoCompleteTextEdit(self) self.search.database_name = db_name self.search.setMaximumHeight(30) self.search.setPlaceholderText("Quick Search") diff --git a/activity_browser/ui/widgets/line_edit.py b/activity_browser/ui/widgets/line_edit.py index 9414fa878..427663938 100644 --- a/activity_browser/ui/widgets/line_edit.py +++ b/activity_browser/ui/widgets/line_edit.py @@ -1,9 +1,6 @@ from qtpy import QtWidgets -from qtpy.QtCore import QTimer, Slot, Signal, SignalInstance, QStringListModel, Qt -from qtpy.QtGui import QTextFormat, QSyntaxHighlighter, QTextCharFormat, QTextDocument, QTextCursor -from qtpy.QtWidgets import QCompleter - -from activity_browser.bwutils import AB_metadata +from qtpy.QtCore import QTimer, Slot, Signal, SignalInstance +from qtpy.QtGui import QTextFormat class ABLineEdit(QtWidgets.QLineEdit): @@ -113,256 +110,3 @@ def focusOutEvent(self, event): self._before = after actions.ActivityModify.run(self._key, self._field, after) super(SignalledComboEdit, self).focusOutEvent(event) - - -class AutoCompleteLineEdit(QtWidgets.QLineEdit): - """Line Edit with a completer attached""" - - def __init__(self, items: list[str], parent=None): - super().__init__(parent=parent) - completer = QCompleter(items, self) - self.setCompleter(completer) - - -class MetaDataAutoCompleteLineEdit(ABLineEdit): - """Line Edit with MetaDataStore completer attached""" - - def __init__(self, parent=None): - super().__init__(parent=parent) - self.database_name = "" - - # autocompleter settings - self.model = QStringListModel() - self.completer = QCompleter(self.model) - self.popup = 
self.completer.popup() - self.popup.setHorizontalScrollBarPolicy(Qt.ScrollBarAsNeeded) - self.completer.setPopup(self.popup) - # allow all items in popup list - self.completer.setCompletionMode(QCompleter.UnfilteredPopupCompletion) - self.setCompleter(self.completer) - - # connect textEdited, this only triggers on user input, not Completer input - self.textEdited.connect(self._set_items) - - def _set_items(self, text=None): - if text is None: - text = self.text() - - # find the start and end of the word under the cursor - cursor_pos = self.cursorPosition() - start = cursor_pos - while start > 0 and text[start - 1] != " ": - start -= 1 - end = cursor_pos - while end < len(text) and text[end] != " ": - end += 1 - current_word = text[start:end] - if not current_word: - self.model.setStringList([]) - return - context = set((text[:start] + text[end:]).split(" ")) - - # get suggestions for the current word - alternatives = AB_metadata.auto_complete(current_word, context=context, database=self.database_name) - alternatives = alternatives[:6] # at most 6, though we should get ~3 usually - # replace the current word with each alternative - items = [] - for alt in alternatives: - new_text = text[:start] + alt + text[end:] - items.append(new_text) - print(text, items) - - self.model.setStringList(items) - # set correct height now that we have data - max_height = max( - 20, - self.popup.sizeHintForRow(0) * 3 + 2 * self.popup.frameWidth() - ) - self.popup.setMaximumHeight(max_height) - - -class UnknownWordHighlighter(QSyntaxHighlighter): - def __init__(self, parent: QTextDocument, known_words: set): - super().__init__(parent) - self.known_words = known_words - - # define the format for unknown words - self.unknown_format = QTextCharFormat() - self.unknown_format.setUnderlineStyle(QTextCharFormat.SpellCheckUnderline) - self.unknown_format.setUnderlineColor(Qt.red) - - def highlightBlock(self, text: str): - if text.startswith("="): - return - words = text.split() - index = 0 - for word in words: - word_len = len(word) - if word and word not in self.known_words: - self.setFormat(index, word_len, self.unknown_format) - index += word_len + 1 # +1 for the space - - -class ABTextEdit(QtWidgets.QTextEdit): - textChangedDebounce: SignalInstance = Signal(str) - _debounce_ms = 250 - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - self._debounce_timer = QTimer(self, singleShot=True) - - self.textChanged.connect(self._set_debounce) - self._debounce_timer.timeout.connect(self._emit_debounce) - - def _set_debounce(self): - self._debounce_timer.setInterval(self._debounce_ms) - self._debounce_timer.start() - - def _emit_debounce(self): - self.textChangedDebounce.emit(self.toPlainText()) - - def debounce(self): - return self._debounce_ms - - def setDebounce(self, ms: int): - self._debounce_ms = ms - - -class MetaDataAutoCompleteLineEdit(ABTextEdit): - """Line Edit with MetaDataStore completer attached""" - - def __init__(self, parent=None): - super().__init__(parent=parent) - self.database_name = "" - self.auto_complete_word = "" - - # autocompleter settings - self.model = QStringListModel() - self.completer = QCompleter(self.model) - self.completer.setWidget(self) - self.popup = self.completer.popup() - self.popup.setHorizontalScrollBarPolicy(Qt.ScrollBarAsNeeded) - self.completer.setPopup(self.popup) - # allow all items in popup list - self.completer.setCompletionMode(QCompleter.UnfilteredPopupCompletion) - self.completer.activated.connect(self._insert_auto_complete) - - 
self.textChanged.connect(self.sanitize_input) - self.highlighter = UnknownWordHighlighter(self.document(), set()) - self.cursorPositionChanged.connect(self._set_items) - - def sanitize_input(self): - self._debounce_timer.stop() - text = self.toPlainText() - clean_text = AB_metadata.search_engine.ONE_SPACE_PATTERN.sub(" ", text) - - if clean_text != text: - cursor = self.textCursor() - position = cursor.position() - self.blockSignals(True) - self.clear() - self.insertPlainText(clean_text) - self.blockSignals(False) - cursor.setPosition(min(position, len(clean_text))) - self.setTextCursor(cursor) - - known_words = set() - for identifier in AB_metadata.search_engine.database_id_manager(self.database_name): - known_words.update(AB_metadata.search_engine.identifier_to_word[identifier].keys()) - self.highlighter.known_words = known_words - - if len(text) == 0: - self.popup.close() - self._set_debounce() - - def _insert_auto_complete(self, completion): - cursor = self.textCursor() - position = cursor.position() - text = self.toPlainText() - - start = position - while start > 0 and text[start - 1] != " ": - start -= 1 - new_position = start + len(completion) + 1 - - # select the word under the cursor - cursor.select(QTextCursor.WordUnderCursor) - # replace it with the completion - cursor.insertText(completion + " ") - # set the updated cursor to end of inserted word + space - cursor.setPosition(min(new_position, len(text[:start] + completion) + 1)) - self.setTextCursor(cursor) - - self.popup.close() - self.auto_complete_word = "" - self.model.setStringList([]) - - def _set_items(self): - text = self.toPlainText() - if text.startswith("="): - self.model.setStringList([]) - self.auto_complete_word = "" - self.popup.close() - return - - # find the start and end of the word under the cursor - cursor_pos = self.textCursor().position() - start = cursor_pos - while start > 0 and text[start - 1] != " ": - start -= 1 - end = cursor_pos - while end < len(text) and text[end] != " ": - end += 1 - current_word = text[start:end] - if not current_word: - self.model.setStringList([]) - self.popup.close() - return - if self.auto_complete_word == current_word: - # avoid unnecessary auto_complete calls if the current word didnt change - return - self.auto_complete_word = current_word - - context = set((text[:start] + text[end:]).split(" ")) - # get suggestions for the current word - alternatives = AB_metadata.auto_complete(current_word, context=context, database=self.database_name) - alternatives = alternatives[:6] # at most 6, though we should get ~3 usually - # replace the current word with each alternative - items = [] - for alt in alternatives: - new_text = text[:start] + alt + text[end:] - # items.append(new_text) - items.append(alt) - print(cursor_pos, text, items) - if len(items) == 0: - self.popup.close() - return - - self.model.setStringList(items) - # set correct height now that we have data - max_height = max( - 20, - self.popup.sizeHintForRow(0) * 3 + 2 * self.popup.frameWidth() - ) - self.popup.setMaximumHeight(max_height) - self.completer.complete() - - def keyPressEvent(self, event): - key = event.key() - - if key in (Qt.Key_Enter, Qt.Key_Return, Qt.Key_Tab): - # insert an autocomplete item - # capture enter/return/tab key - index = self.popup.currentIndex() - completion_text = index.data(Qt.DisplayRole) - self.completer.activated.emit(completion_text) - return - elif key in (Qt.Key_Space,): - self.popup.close() - - super().keyPressEvent(event) - - # trigger on text input keys - if event.text() or key 
in (Qt.LeftArrow, Qt.RightArrow): # filters out non-text keys like arrows, shift, etc. - self._set_items() diff --git a/activity_browser/ui/widgets/text_edit.py b/activity_browser/ui/widgets/text_edit.py new file mode 100644 index 000000000..aff4344ae --- /dev/null +++ b/activity_browser/ui/widgets/text_edit.py @@ -0,0 +1,251 @@ +from qtpy import QtWidgets +from qtpy.QtCore import QTimer, Signal, SignalInstance, QStringListModel, Qt +from qtpy.QtGui import QSyntaxHighlighter, QTextCharFormat, QTextDocument, QFont +from qtpy.QtWidgets import QCompleter, QStyledItemDelegate, QStyle + +from activity_browser.bwutils import AB_metadata + + +class UnknownWordHighlighter(QSyntaxHighlighter): + def __init__(self, parent: QTextDocument, known_words: set): + super().__init__(parent) + self.known_words = known_words + + # define the format for unknown words + self.unknown_format = QTextCharFormat() + self.unknown_format.setUnderlineStyle(QTextCharFormat.SpellCheckUnderline) + self.unknown_format.setUnderlineColor(Qt.red) + + def highlightBlock(self, text: str): + if text.startswith("="): + return + words = text.split() + index = 0 + for word in words: + word_len = len(word) + if word and word not in self.known_words: + self.setFormat(index, word_len, self.unknown_format) + index += word_len + 1 # +1 for the space + + +class AutoCompleteDelegate(QStyledItemDelegate): + def __init__(self, parent=None, get_bold_word_func=None): + super().__init__(parent) + self.get_bold_word_func = get_bold_word_func + + def paint(self, painter, option, index): + text = index.data(Qt.DisplayRole) + bold_words = self.get_bold_word_func() + bold_words = {word.lower() for word in bold_words} + + painter.save() + + # Draw selection background if selected + if option.state & QStyle.State_Selected: + painter.fillRect(option.rect, option.palette.highlight()) + painter.setPen(option.palette.highlightedText().color()) + else: + painter.setPen(option.palette.text().color()) + + # Split text into words and draw each with appropriate font + words = text.split(" ") + x = option.rect.x() + y = option.rect.y() + spacing = 4 # space between words + font = option.font + metrics = painter.fontMetrics() + + for word in words: + word_font = QFont(font) + if word.lower() in bold_words: + word_font.setBold(True) + painter.setFont(word_font) + + word_width = metrics.horizontalAdvance(word) + painter.drawText(x, y + metrics.ascent() + (option.rect.height() - metrics.height()) // 2, word) + x += word_width + spacing + painter.restore() + + +class ABTextEdit(QtWidgets.QTextEdit): + textChangedDebounce: SignalInstance = Signal(str) + _debounce_ms = 250 + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + self._debounce_timer = QTimer(self, singleShot=True) + + self.textChanged.connect(self._set_debounce) + self._debounce_timer.timeout.connect(self._emit_debounce) + + def _set_debounce(self): + self._debounce_timer.setInterval(self._debounce_ms) + self._debounce_timer.start() + + def _emit_debounce(self): + self.textChangedDebounce.emit(self.toPlainText()) + + def debounce(self): + return self._debounce_ms + + def setDebounce(self, ms: int): + self._debounce_ms = ms + + +class ABAutoCompleTextEdit(ABTextEdit): + def __init__(self, parent=None, highlight_unknown=False): + super().__init__(parent=parent) + self.auto_complete_word = "" + self.auto_complete_suggestions = [] + + # autocompleter settings + self.model = QStringListModel() + self.completer = QCompleter(self.model) + self.completer.setWidget(self) + self.popup = 
self.completer.popup() + # set custom delegate to bold the current word + delegate = AutoCompleteDelegate(self.popup, get_bold_word_func=lambda: self.auto_complete_suggestions) + self.popup.setItemDelegate(delegate) + self.popup.setHorizontalScrollBarPolicy(Qt.ScrollBarAsNeeded) + self.completer.setPopup(self.popup) + self.completer.setCompletionMode(QCompleter.UnfilteredPopupCompletion) # allow all items in popup list + self.completer.activated.connect(self._insert_auto_complete) + + self.textChanged.connect(self._sanitize_input) + if highlight_unknown: + self.highlighter = UnknownWordHighlighter(self.document(), set()) + self.cursorPositionChanged.connect(self._set_autocomplete_items) + + def keyPressEvent(self, event): + key = event.key() + + if key in (Qt.Key_Enter, Qt.Key_Return, Qt.Key_Tab): + # insert an autocomplete item + # capture enter/return/tab key + index = self.popup.currentIndex() + completion_text = index.data(Qt.DisplayRole) + self.completer.activated.emit(completion_text) + return + elif key in (Qt.Key_Space,): + self.popup.close() + + super().keyPressEvent(event) + + # trigger on text input keys + if event.text() or key in (Qt.LeftArrow, Qt.RightArrow): # filters out non-text keys except l/r arrows + self._set_autocomplete_items() + + def _sanitize_input(self): + raise NotImplementedError + + def _set_autocomplete_items(self): + raise NotImplementedError + + def _insert_auto_complete(self, completion): + cursor = self.textCursor() + position = cursor.position() + completion = completion + " " # add space to end of new text + + # find where to put cursor back + new_position = position + while new_position < len(completion) and completion[new_position] != " ": + new_position += 1 + new_position += 1 # add one char for space + + # set new text from completion + self.blockSignals(True) + self.clear() + self.setText(completion) + # set the cursor location + cursor.setPosition(min(new_position, len(completion))) + self.setTextCursor(cursor) + self.blockSignals(False) + + # house keeping + self._emit_debounce() + self.popup.close() + self.auto_complete_word = "" + self.model.setStringList([]) + + +class MetaDataAutoCompleteTextEdit(ABAutoCompleTextEdit): + """TextEdit with MetaDataStore completer attached.""" + def __init__(self, parent=None): + super().__init__(parent=parent, highlight_unknown=True) + self.database_name = "" + + def _sanitize_input(self): + self._debounce_timer.stop() + text = self.toPlainText() + clean_text = AB_metadata.search_engine.ONE_SPACE_PATTERN.sub(" ", text) + + if clean_text != text: + cursor = self.textCursor() + position = cursor.position() + self.blockSignals(True) + self.clear() + self.insertPlainText(clean_text) + self.blockSignals(False) + cursor.setPosition(min(position, len(clean_text))) + self.setTextCursor(cursor) + + known_words = set() + for identifier in AB_metadata.search_engine.database_id_manager(self.database_name): + known_words.update(AB_metadata.search_engine.identifier_to_word[identifier].keys()) + self.highlighter.known_words = known_words + + if len(text) == 0: + self.popup.close() + self._set_debounce() + + def _set_autocomplete_items(self): + text = self.toPlainText() + if text.startswith("="): + self.model.setStringList([]) + self.auto_complete_word = "" + self.popup.close() + return + + # find the start and end of the word under the cursor + cursor = self.textCursor() + position = cursor.position() + start = position + while start > 0 and text[start - 1] != " ": + start -= 1 + end = position + while end < len(text) and 
text[end] != " ": + end += 1 + current_word = text[start:end] + if not current_word: + self.model.setStringList([]) + self.popup.close() + self.auto_complete_word = "" + return + if self.auto_complete_word == current_word: + # avoid unnecessary auto_complete calls if the current word didnt change + return + self.auto_complete_word = current_word + + context = set((text[:start] + text[end:]).split(" ")) + # get suggestions for the current word + suggestions = AB_metadata.auto_complete(current_word, context=context, database=self.database_name) + suggestions = suggestions[:6] # at most 6, though we should get ~3 usually + self.auto_complete_suggestions = suggestions # set for bolding of autocomplete suggestions + # replace the current word with each alternative + items = [] + for alt in suggestions: + new_text = text[:start] + alt + text[end:] + items.append(new_text) + if len(items) == 0: + self.popup.close() + return + + self.model.setStringList(items) + # set correct height now that we have data + max_height = max( + 20, + self.popup.sizeHintForRow(0) * 3 + 2 * self.popup.frameWidth() + ) + self.popup.setMaximumHeight(max_height) + self.completer.complete() From 90583c668e5fcbdcfb1e357cf6ae13a5673352ac Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Fri, 12 Sep 2025 17:02:11 +0200 Subject: [PATCH 44/47] Refactor textedit to proper location --- activity_browser/ui/widgets/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/activity_browser/ui/widgets/__init__.py b/activity_browser/ui/widgets/__init__.py index 333811439..89d2c30ca 100644 --- a/activity_browser/ui/widgets/__init__.py +++ b/activity_browser/ui/widgets/__init__.py @@ -1,8 +1,8 @@ from .abstract_pane import ABAbstractPane from .comparison_switch import SwitchComboBox from .cutoff_menu import CutoffMenu -from .line_edit import (ABLineEdit, SignalledComboEdit, SignalledLineEdit, - SignalledPlainTextEdit, MetaDataAutoCompleteLineEdit) +from .line_edit import ABLineEdit, SignalledComboEdit, SignalledLineEdit, SignalledPlainTextEdit +from .text_edit import MetaDataAutoCompleteTextEdit from .treeview import ABTreeView from .item_model import ABItemModel from .item import ABAbstractItem, ABBranchItem, ABDataItem From fecbcf20cb8f4c6bf5f2c267b9fdb20b2ef68ab6 Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Tue, 16 Sep 2025 11:34:33 +0200 Subject: [PATCH 45/47] Implement search caching for faster results --- .../bwutils/searchengine/metadata_search.py | 143 +++++++++++++++--- 1 file changed, 123 insertions(+), 20 deletions(-) diff --git a/activity_browser/bwutils/searchengine/metadata_search.py b/activity_browser/bwutils/searchengine/metadata_search.py index 374ca56e0..1814a3e8a 100644 --- a/activity_browser/bwutils/searchengine/metadata_search.py +++ b/activity_browser/bwutils/searchengine/metadata_search.py @@ -12,17 +12,22 @@ class MetaDataSearchEngine(SearchEngine): + + # caching for faster operation def database_id_manager(self, database): if not hasattr(self, "all_database_ids"): self.all_database_ids = {} if database_ids := self.all_database_ids.get(database): self.database_ids = database_ids + self.current_database = database elif database is not None: self.database_ids = set(self.df[self.df["database"] == database].index.to_list()) self.all_database_ids[database] = self.database_ids + self.current_database = database else: self.database_ids = None + self.current_database = "_@@NO_DB_" return self.database_ids def reset_database_id_manager(self): @@ -31,10 +36,54 @@ def reset_database_id_manager(self): 
if hasattr(self, "database_ids"): del self.database_ids - def add_identifier(self, data: pd.DataFrame) -> None: - super().add_identifier(data) + def database_word_manager(self, database): + if not hasattr(self, "all_database_words"): + self.all_database_words = {} + + if database_words := self.all_database_words.get(database): + self.database_words = database_words + elif database is not None: + ids = self.database_id_manager(database) + self.database_words = self.reverse_dict_many_to_one({_id: self.identifier_to_word[_id] for _id in ids}) + self.all_database_words[database] = self.database_words + else: + self.database_words = None + return self.database_words + + def reset_database_word_manager(self, database): + if hasattr(self, "all_database_words") and self.all_database_words.get(database): + del self.all_database_words[database] + if hasattr(self, "database_words"): + del self.database_words + + def database_search_cache(self, database, query, result = None): + if not hasattr(self, "search_cache"): + self.search_cache = {} + + if result: + if self.search_cache.get(database): + self.search_cache[database][query] = result + else: + self.search_cache[database] = {query: result} + return + if db_cache := self.search_cache.get(database): + if cached_result := db_cache.get(query): + return cached_result + return + + def reset_search_cache(self, database): + if hasattr(self, "search_cache") and self.search_cache.get(database): + del self.search_cache[database] + + def reset_all_caches(self, databases): self.reset_database_id_manager() + for database in databases: + self.reset_database_word_manager(database) + self.reset_search_cache(database) + def add_identifier(self, data: pd.DataFrame) -> None: + super().add_identifier(data) + self.reset_all_caches(data["database"].unique()) def remove_identifiers(self, identifiers, logging=True) -> None: t = time() @@ -42,6 +91,7 @@ def remove_identifiers(self, identifiers, logging=True) -> None: identifiers = set(identifiers) current_identifiers = set(self.df.index.to_list()) identifiers = identifiers | current_identifiers # only remove identifiers currently in the data + databases = self.df.loc[identifiers, ["databases"]].unique() # extract databases for cache cleaning if len(identifiers) == 0: return @@ -51,11 +101,11 @@ def remove_identifiers(self, identifiers, logging=True) -> None: if logging: log.debug(f"Search index updated in {time() - t:.2f} seconds " f"for {len(identifiers)} removed items ({len(self.df)} items ({self.size_of_index()}) currently).") - self.reset_database_id_manager() + self.reset_all_caches(databases) def change_identifier(self, identifier, data: pd.DataFrame) -> None: super().change_identifier(identifier, data) - self.reset_database_id_manager() + self.reset_all_caches(data["database"].unique()) def auto_complete(self, word: str, context: Optional[set] = set(), database: Optional[str] = None) -> list: """Based on spellchecker, make more useful for autocompletions @@ -188,6 +238,53 @@ def find_q_gram_matches(self, q_grams: set, return_all: bool = False) -> pd.Data return matches.iloc[:min(len(matches), 2500), :] # return at most this many results + def search_size_1(self, queries: list, original_words: set, orig_word_weight=5, exact_word_weight=1) -> dict: + """Return a dict of {query_word: Counter(identifier)}. 
+ + queries: a list of length-1 tuples/lists, each containing a searched word or a 'spell checked' similar word + original_words: the set of words actually searched for (not including spell-checked alternatives) + + orig_word_weight: additional weight to add to original words + exact_word_weight: additional weight to add to exact word matches (as opposed to substring matches) + + First, we find all matching words, creating a dict with the words in 'queries' as keys and the words matching each query word as a list of values. + Next, we convert this to identifiers and add weights: + the weight is increased by 'orig_word_weight' for original words and by 'exact_word_weight' for exact matches. + """ + matches = {} + t2 = time() + # add each word in search index if query_word in word + for word in self.database_words.keys(): + for query in queries: + # query is list/tuple of len 1 + query_word = query[0] # only use the word + if query_word in word: + words = matches.get(query_word, []) + words.extend([word]) + matches[query_word] = words + + # now convert matched words to matched identifiers + matched_identifiers = {} + for word, matching_words in matches.items(): + if result := self.database_search_cache(self.current_database, word): + matched_identifiers[word] = result + continue + id_counter = matched_identifiers.get(word, Counter()) + for matched_word in matching_words: + weight = self.base_weight + + # add the word n times, where n is the weight, original search word is weighted higher than alternatives + if matched_word in original_words: + weight += orig_word_weight # increase weight for original word + if matched_word == word: + weight += exact_word_weight # increase weight for exact matching word + + id_counter = self.weigh_identifiers(self.database_words[matched_word], weight, id_counter) + matched_identifiers[word] = id_counter + self.database_search_cache(self.current_database, word, matched_identifiers[word]) + + return matched_identifiers + def fuzzy_search(self, text: str, database: Optional[str] = None, return_counter: bool = False, logging: bool = True) -> list: """Overwritten for extra database specific reduction of results.
""" @@ -200,6 +297,7 @@ def fuzzy_search(self, text: str, database: Optional[str] = None, return_counter # DATABASE SPECIFIC get the set of ids that is in this database self.database_id_manager(database) + self.database_word_manager(database) queries = self.build_queries(text) @@ -279,17 +377,21 @@ def fuzzy_search(self, text: str, database: Optional[str] = None, return_counter # now search for all permutations of this query combined with a space query_df = search_df[search_df[self.identifier_name].isin(query_identifiers)] for query_perm in permutations(query): - mask = self.filter_dataframe(query_df, " ".join(query_perm), search_columns=["query_col"]) - new_df = query_df.loc[mask].reset_index(drop=True) - if len(new_df) == 0: - # there is no match for this permutation of words, skip - continue - new_id_list = new_df[self.identifier_name] - - new_ids = Counter() - for new_id in new_id_list: - new_ids[new_id] = query_identifiers[new_id] - + query_perm_str = " ".join(query_perm) + if result := self.database_search_cache(self.current_database, query_perm_str): + new_ids = result + else: + mask = self.filter_dataframe(query_df, query_perm_str, search_columns=["query_col"]) + new_df = query_df.loc[mask].reset_index(drop=True) + if len(new_df) == 0: + # there is no match for this permutation of words, skip + continue + new_id_list = new_df[self.identifier_name] + + new_ids = Counter() + for new_id in new_id_list: + new_ids[new_id] = query_identifiers[new_id] + self.database_search_cache(self.current_database, query_perm_str, new_ids) # we weigh a combination of words that is next also to each other even higher than just the words separately query_to_identifier[query_name] = self.weigh_identifiers(new_ids, weight, query_to_identifier[query_name]) @@ -298,14 +400,15 @@ def fuzzy_search(self, text: str, database: Optional[str] = None, return_counter for identifiers in query_to_identifier.values(): all_identifiers += identifiers + if return_counter: + return_this = all_identifiers + else: + # now sort on highest weights and make list type + return_this = [identifier[0] for identifier in all_identifiers.most_common()] if logging: log.debug( f"Found {len(all_identifiers)} search results for '{text}' in {len(self.df)} items in {time() - t:.2f} seconds") - if return_counter: - return all_identifiers - # now sort on highest weights and make list type - sorted_identifiers = [identifier[0] for identifier in all_identifiers.most_common()] - return sorted_identifiers + return return_this def search(self, text, database: Optional[str] = None) -> list: """Search the dataframe on this text, return a sorted list of identifiers.""" From 9734ad2c467b549b87caeb1e23b4e768c8750e66 Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Tue, 16 Sep 2025 11:55:52 +0200 Subject: [PATCH 46/47] bold only current word, not all search suggested words --- activity_browser/ui/widgets/text_edit.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/activity_browser/ui/widgets/text_edit.py b/activity_browser/ui/widgets/text_edit.py index aff4344ae..9daf4fabe 100644 --- a/activity_browser/ui/widgets/text_edit.py +++ b/activity_browser/ui/widgets/text_edit.py @@ -29,14 +29,12 @@ def highlightBlock(self, text: str): class AutoCompleteDelegate(QStyledItemDelegate): - def __init__(self, parent=None, get_bold_word_func=None): + def __init__(self, parent=None): super().__init__(parent) - self.get_bold_word_func = get_bold_word_func + self.current_word_index = -1 def paint(self, painter, option, index): text = 
index.data(Qt.DisplayRole) - bold_words = self.get_bold_word_func() - bold_words = {word.lower() for word in bold_words} painter.save() @@ -55,9 +53,9 @@ def paint(self, painter, option, index): font = option.font metrics = painter.fontMetrics() - for word in words: + for i, word in enumerate(words): word_font = QFont(font) - if word.lower() in bold_words: + if i+1 == self.current_word_index: word_font.setBold(True) painter.setFont(word_font) @@ -97,16 +95,14 @@ class ABAutoCompleTextEdit(ABTextEdit): def __init__(self, parent=None, highlight_unknown=False): super().__init__(parent=parent) self.auto_complete_word = "" - self.auto_complete_suggestions = [] # autocompleter settings self.model = QStringListModel() self.completer = QCompleter(self.model) self.completer.setWidget(self) self.popup = self.completer.popup() - # set custom delegate to bold the current word - delegate = AutoCompleteDelegate(self.popup, get_bold_word_func=lambda: self.auto_complete_suggestions) - self.popup.setItemDelegate(delegate) + self.delegate = AutoCompleteDelegate(self.popup) # set custom delegate to bold the current word + self.popup.setItemDelegate(self.delegate) self.popup.setHorizontalScrollBarPolicy(Qt.ScrollBarAsNeeded) self.completer.setPopup(self.popup) self.completer.setCompletionMode(QCompleter.UnfilteredPopupCompletion) # allow all items in popup list @@ -228,10 +224,10 @@ def _set_autocomplete_items(self): self.auto_complete_word = current_word context = set((text[:start] + text[end:]).split(" ")) + self.delegate.current_word_index = len(text[:start].split(" ")) # current word index for bolding # get suggestions for the current word suggestions = AB_metadata.auto_complete(current_word, context=context, database=self.database_name) suggestions = suggestions[:6] # at most 6, though we should get ~3 usually - self.auto_complete_suggestions = suggestions # set for bolding of autocomplete suggestions # replace the current word with each alternative items = [] for alt in suggestions: From e342f2247f82f0980ea912b5ae604f9ac59ea29b Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Tue, 16 Sep 2025 12:51:42 +0200 Subject: [PATCH 47/47] enable dealing with empty metadata in tests --- activity_browser/bwutils/searchengine/base.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/activity_browser/bwutils/searchengine/base.py b/activity_browser/bwutils/searchengine/base.py index 5a7752e3a..5b9127985 100644 --- a/activity_browser/bwutils/searchengine/base.py +++ b/activity_browser/bwutils/searchengine/base.py @@ -114,6 +114,9 @@ def update_dict(update_me: dict, new: dict) -> dict: update_me = update_me | new_data return update_me + if len(update_df) == 0: + return + t = time() size_old = len(self.df) # identifier to word and df
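
For reference, the caching pattern that PATCH 45 introduces in MetaDataSearchEngine can be reduced to the small, self-contained sketch below: database_search_cache doubles as getter and setter keyed by database and query, and reset_all_caches drops the per-database entries whenever identifiers are added, removed or changed, so stale results are never served. This sketch is illustrative only; the class, method and database names here are simplified and hypothetical, and it is not wired to the search index itself.

from collections import Counter


class QueryCache:
    # per-database cache of {query: Counter(identifier -> weight)} results
    def __init__(self):
        self._cache = {}  # {database: {query: Counter}}

    def query_cache(self, database, query, result=None):
        # setter: a truthy result is stored for this database/query pair
        # (mirrors the patch: empty/falsy results are treated as misses)
        if result:
            self._cache.setdefault(database, {})[query] = result
            return None
        # getter: return the cached result, or None on a cache miss
        return self._cache.get(database, {}).get(query)

    def reset(self, database):
        # call whenever identifiers in this database change
        self._cache.pop(database, None)


cache = QueryCache()
cache.query_cache("db_a", "steel", Counter({101: 12, 205: 10}))          # store
assert cache.query_cache("db_a", "steel") == Counter({101: 12, 205: 10})  # hit
cache.reset("db_a")                                                       # invalidate
assert cache.query_cache("db_a", "steel") is None                         # miss again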