From fa68328058373c296053dea7bc5d57f7a3ce4564 Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Mon, 21 Jul 2025 14:15:45 +0200 Subject: [PATCH 01/47] search class --- activity_browser/bwutils/search/__init__.py | 1 + .../bwutils/search/searchengine.py | 627 ++++++++++++++++++ 2 files changed, 628 insertions(+) create mode 100644 activity_browser/bwutils/search/__init__.py create mode 100644 activity_browser/bwutils/search/searchengine.py diff --git a/activity_browser/bwutils/search/__init__.py b/activity_browser/bwutils/search/__init__.py new file mode 100644 index 000000000..f9fde759c --- /dev/null +++ b/activity_browser/bwutils/search/__init__.py @@ -0,0 +1 @@ +from searchengine import SearchEngine \ No newline at end of file diff --git a/activity_browser/bwutils/search/searchengine.py b/activity_browser/bwutils/search/searchengine.py new file mode 100644 index 000000000..84a9c6333 --- /dev/null +++ b/activity_browser/bwutils/search/searchengine.py @@ -0,0 +1,627 @@ +from itertools import permutations, chain +import itertools +import functools +from collections import Counter, OrderedDict, Iterable +import pandas as pd +import numpy as np +import re + + +class SearchEngine: + """ + A Search Engine class, takes a dataframe and makes it searchable. + + A search requires a string, and will return a list of unique identifiers in the dataframe. + There are three options for search: + SearchEngine.literal_search(): searches for exact matches of the search query + SearchEngine.fuzzy_search(): searches for approximate matches of search query, sorted by relevance + SearchEngine.search(): combines both of the above, literal matches are returned first, next all fuzzy results, buth subsets sorted by relevance + It is recommended to always use searchEngine.search(), but the other options are there. + + Initialization takes: + df: Dataframe that needs to be searchable. + identifier_name: values in this column will be returned as search results, all values in this column need to be unique. + searchable_columns: these columns need to be searchable, if none are given, all columns will be made searchable. 
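A minimal usage sketch (the frame below and its column names are made up for illustration; only the identifier column has to exist and hold unique values):

    import pandas as pd
    # from activity_browser.bwutils.search import SearchEngine

    df = pd.DataFrame(
        [["a", "coal production", "coal"],
         ["b", "steel production", "steel"]],
        columns=["id", "name", "product"],
    )
    engine = SearchEngine(df, identifier_name="id", searchable_columns=["name", "product"])
    results = engine.search("coal production")  # sorted identifiers, best match first, e.g. ["a", "b"]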
+ + Updating data is possible as well: + add_identifier(): adds this identifier to the searchable data + remove_identifier(): removes this identifier from the searchable data + change_identifier(): changes this identifier (wrapper for remove_identifier and add_identifier) + + """ + + def __init__(self, df: pd.DataFrame, identifier_name: str, searchable_columns: list = []): + + # compile regex patterns for cleaning + self.SUB_PATTERN = re.compile(r"[,\(\)\[\]'\"]") # for replacing with empty string + self.SPACE_PATTERN = re.compile(r"[-−:;]") # for replacing with space + self.ONE_SPACE_PATTERN = re.compile(r"\s+") # for replacing multiple white space with 1 space + + self.q = 2 # character lenght of q grams + self.base_weight = 10 # base weigthing for sorting results + + assert identifier_name in df.columns # make sure identifier col exist + assert df[identifier_name].nunique() == df.shape[0] # make sure identifiers are all unique + self.identifier_name = identifier_name + + # ensure columns given actually exist + # always ensure "identifier" is present + if searchable_columns == []: + # if no list is given, assume all columns are searchable + self.columns = list(df.columns) + else: + # create subset of columns to be searchable, discard rest + self.columns = [col for col in searchable_columns if col in df.columns] + if self.identifier_name not in self.columns: # keep identifier col + self.columns.append(self.identifier_name) + df = df[self.columns] + # set the identifier column as index + df = df.set_index(self.identifier_name, drop=False) + + # convert all data to str + df = df.astype(str) + + # find the self.identifier_name column index and store as int + self.identifier_column = self.columns.index(self.identifier_name) + + # store all searchable columns except the identifier + self.regular_columns = [i for i in range(len(self.columns)) if i != self.identifier_column] + + # initialize search index dicts and update df + self.identifier_to_word = {} + self.word_to_identifier = {} + self.word_to_q_grams = {} + self.q_gram_to_word = {} + self.df = pd.DataFrame() + + self.update_index(df) + + # +++ Utility functions + + def update_index(self, update_df: pd.DataFrame) -> None: + """Update search index dicts and the df.""" + + def update_dict(update_me: dict, new: dict) -> dict: + """Update a dict of counters with new dict of counters.""" + for dict_key, _counter in new.items(): + if dict_key in update_me: + update_me[dict_key].update(_counter) + else: + update_me[dict_key] = _counter + return update_me + + # identifier to word and df + i2w, update_df = self.words_in_df(update_df) + self.identifier_to_word = update_dict(self.identifier_to_word, i2w) + self.df = pd.concat([self.df, update_df]) + + # word to identifier + w2i = self.reverse_dict_many_to_one(i2w) + self.word_to_identifier = update_dict(self.word_to_identifier, w2i) + + # word to qgram + w2q = self.list_to_q_grams(w2i.keys()) + self.word_to_q_grams = update_dict(self.word_to_q_grams, w2q) + + # gram to word + q2w = self.reverse_dict_many_to_one(w2q) + self.q_gram_to_word = update_dict(self.q_gram_to_word, q2w) + + def clean_text(self, text: str): + """Clean a string so it doesn't contain weird characters or multiple spaces etc.""" + text = self.SUB_PATTERN.sub("", text.lower()) + text = self.SPACE_PATTERN.sub(" ", text) + text = self.ONE_SPACE_PATTERN.sub(" ", text).strip() + return text + + def text_to_positional_q_gram(self, text: str) -> list: + """Return a positional list of qgrams for the given string. 
+ + https://en.wikipedia.org/wiki/N-gram + q-grams are n-grams on character level. + + qgrams of "word" would be "wo", "or" and "rd" for q=2 + + Note: these are technically positional q grams, but we don't use their + positions currently + """ + q = self.q + + # just return a single-item list if the text is equal or shorter than q + # else, generate qgrams + if len(text) <= q: + return [text] + else: + return [text[i:i + q] for i in range(len(text) - q + 1)] + + def words_in_df(self, df: pd.DataFrame = None) -> tuple[dict, pd.DataFrame]: + """Return a dict of {identifier: word} for df.""" + + df = df if any(df) else self.df + return_df = df.copy() + + df = df.iloc[:, self.regular_columns] + identifier_word_dict = {} + col = [] + + for row in df.itertuples(index=True): + line = self.clean_text(" ".join(row[1:])) + col.append(line) + identifier_word_dict[row[0]] = Counter(line.split(" ")) + + return_df["query_col"] = col + + return identifier_word_dict, return_df + + def reverse_dict_many_to_one(self, dictionary: dict) -> dict: + """Reverse a dictionary of Counter objects.""" + reverse = {} + for identifier, counter_object in dictionary.items(): + for countable, count in counter_object.items(): + if countable not in reverse: + reverse[countable] = Counter() + reverse[countable][identifier] += count + return reverse + + def list_to_q_grams(self, word_list: Iterable) -> dict: + """Convert a list of unique words to a dict with Counter objects. + + + q_gram_dict = { + "word": Counter( + "wo": 1 + "or": 1 + "rd": 1 + ) + } + + """ + q_gram_dict = {} + + for word in word_list: + q_gram_dict[word] = Counter(self.text_to_positional_q_gram(word)) + + return q_gram_dict + + # +++ Changes to searchable data + + def add_identifier(self, identifier, data: dict) -> None: + """Add this identifier to the search index. + + identifier is expected to be a unique identifier that has not been used before + data is expected to be a dict of column names and data + """ + + # make sure we don't add an identifier that already exists + assert identifier not in self.df.index.to_list() + + df_cols = self.columns + + # drop fields that are not in self.df + drop = [col for col in data if col not in df_cols] + for field in drop: + del data[field] + + # add empty field for missing data + for col in df_cols: + if col not in data: + data[col] = "" + + # convert to df + new_df = pd.DataFrame(data, index=[identifier]) + new_df = new_df.astype(str) + + # update the search index data + self.update_index(new_df) + + def remove_identifier(self, identifier) -> None: + """Remove this identifier from self.df and the search index. + """ + + # remove from df + self.df.drop(identifier, inplace=True) + + # find words that may need to be removed + words = self.identifier_to_word[identifier] + for word in words: + if len(self.word_to_identifier[word]) == 1: + # this word is only found in this identifier, + # remove the word and check for q grams + del self.word_to_identifier[word] + + q_grams = self.word_to_q_grams[word] + for q_gram in q_grams: + if len(self.q_gram_to_word[q_gram]) == 1: + # this q_gram is only used in this word, + # remove it + del self.q_gram_to_word[q_gram] + + del self.word_to_q_grams[word] + else: + # remove the identifier from the + del self.word_to_identifier[word][identifier] + # finally, remove the identifier + del self.identifier_to_word[identifier] + + def change_identifier(self, identifier, data: dict) -> None: + """Change this identifier. 
+ + identifier is expected to be a unique identifier that is in use + data is expected to be a dict of column names and data that change + + only changed data needs to be supplied + """ + assert identifier in self.df.index.to_list() + + # get existing data + update_data = dict(self.df.loc[identifier].values) + + # overwrite new data where relevant + for field, value in data.items(): + update_data[field] = value + + # remove the entry + self.remove_identifier(identifier) + + # add entry with new data + self.add_identifier(identifier, update_data) + + # +++ Search + + def filter_dataframe(self, df: pd.DataFrame, pattern: str, search_columns: list = None) -> pd.Series: + """Filter the search columns of a dataframe on a pattern. + + Returns a mask (true/false) pd.Series with matching items.""" + + search_columns = search_columns if search_columns else self.columns + + mask = functools.reduce( + np.logical_or, + [ + df[col].apply(lambda x: pattern in x.lower()) + for col in search_columns + ], + ) + return mask + + def literal_search(self, text): + """Do literal search of the text in all original columns that were given.""" + + identifiers = self.filter_dataframe(self.df, text) + df = self.df.loc[identifiers] + identifiers = df.index.to_list() + + return identifiers + + def osa_distance(self, word1: str, word2: str, cutoff: int = 0, cutoff_return: int = 1000) -> int: + """Calculate the Optimal String Alignment (OSA) edit distance between two strings, return edit distance. + + Has additional cutoff variable, if cutoff is higher than 0 and if the words have + a larger difference in length, immediately return a large number + + OSA is a restricted form of the Damerau–Levenshtein distance. + https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance#Optimal_string_alignment_distance + + The edit distance is how many operations (insert, delete, substitute or transpose a character) need to happen to convert one string to another. + insert and delete are obvious operations, but substitute and transpose are explained: + substitute: replace one character with another: e.g. word1=cat word2=cab, t->b substitution is 1 operation + transpose: swap the places of two adjacent characters with each other: e.g. word1=coal word2=cola al -> la transposition is 1 operation + + The minimum amount of operations (OSA edit distance) is returned. 
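A few concrete values for intuition, called on an initialised engine (these agree with the unit tests added later in this series):

    engine.osa_distance("coal", "coal")       # 0, identical strings
    engine.osa_distance("cat", "cab")         # 1, one substitution ('t' -> 'b')
    engine.osa_distance("coal", "cola")       # 1, one transposition ('al' -> 'la')
    engine.osa_distance("coal", "")           # 4, length of the longer string
    engine.osa_distance("coal", "chocolate")  # 6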
+ """ + + if word1 == word2: + # if the strings are the same, immediately return 0 + return 0 + + len1, len2 = len(word1), len(word2) + + if 0 < cutoff < abs(len1 - len2): + # if the length difference between 2 words is over the cutoff, + # just return instead of calculating the edit distance + return cutoff_return + + if len1 == 0 or len2 == 0: + # in case (at least) one of the strings is empty, + # return the lenth of the longest string + return max(len1, len2) + + # Initialize matrix + distance = [[0] * len2 for _ in range(len1)] + + # calculate shortest edit distance + for i in range(len1): + for j in range(len2): + cost = 0 if word1[i] == word2[j] else 1 + + # Compute distances for insertion, deletion and substitution + insertion = distance[i][j - 1] + 1 if j > 0 else i + 1 + deletion = distance[i - 1][j] + 1 if i > 0 else j + 1 + substitution = distance[i - 1][j - 1] + cost if i > 0 and j > 0 else max(i, j) + cost + + distance[i][j] = min(deletion, insertion, substitution) + + # Compute transposition when relevant + if i > 0 and j > 0 and word1[i] == word2[j - 1] and word1[i - 1] == word2[j]: + transposition = distance[i - 2][j - 2] + 1 if i > 1 and j > 1 else max(i, j) - 1 + distance[i][j] = min(distance[i][j], transposition) + + return distance[len1 - 1][len2 - 1] + + def find_q_gram_matches(self, q_grams: list) -> pd.DataFrame: + """Find which of the given q_grams exist in self.q_gram_to_word, + return a sorted dataframe of best matching words. + """ + n_q_grams = len(q_grams) + + matches = {} + + # find words that match our qgrams + for q_gram in q_grams: + if words := self.q_gram_to_word.get(q_gram, False): + # q_gram exists in our search index + for word in words: + matches[word] = matches.get(word, 0) + words[word] + + # if we find no results, return an empty dataframe + if len(matches) == 0: + return pd.DataFrame({"word": [], "matches": []}) + + # otherwise, create a dataframe and + # reduce search results to most relevant results + matches = {"word": matches.keys(), "matches": matches.values()} + matches = pd.DataFrame(matches) + max_q = max(matches["matches"]) + + # determine how many results we want to keep based on how good our results are + min_q = max(max_q * 0.5, # have at least half of qgrams of best match or... + max(n_q_grams * 0.5, # if more, at least half the qgrams in the query word? + 1)) # okay just do 1 qgram if there are no more in the word + + matches = matches[matches["matches"] >= min_q] + matches = matches.sort_values(by="matches", ascending=False) + matches = matches.reset_index(drop=True) + + return matches.iloc[:min(len(matches), 2500), :] # return at most this many results + + def spell_check(self, text: str) -> OrderedDict: + """Create an OrderedDict of each word in the text (space separated) + with as values possible alternatives. + + Alternatives are first found with q-grams, then refined with string edit distance + + We rank alternative words based on 1) edit distance 2) how often a word is used in an entry + If too many results are found, we only keep edit distance 1, + if we want more results, we keep with longer edit distance up to ... 
+ + + word_results = OrderedDict( + "word": [word, work] + ) + + """ + + word_results = OrderedDict() + + matches_goal = 3 # ideally we have at least this many alternatives + + always_accept_this = 1 # values of this or lower always accepted + never_accept_this = 4 # values this or over always rejected + + # make list of unique words + words = OrderedDict() + for word in text.split(" "): + words[word] = False + words = words.keys() + + words = [self.clean_text(word) for word in words] + + for word in words: + + # first, find possible matches quickly + q_grams = self.text_to_positional_q_gram(word) + possible_matches = self.find_q_gram_matches(q_grams) + + matches = [] + other_matches = {} + + # now, refine with levenshtein + for row in possible_matches.itertuples(): + + edit_distance = self.osa_distance(word, row[1], cutoff=never_accept_this) + + if edit_distance == 0: + continue # we are looking for alternatives only, not the exact word + elif edit_distance <= always_accept_this: + matches.append(row[1]) + elif edit_distance < never_accept_this: + if other_matches.get(edit_distance): + other_matches[edit_distance].append(row[1]) + else: + other_matches[edit_distance] = [row[1]] + + # if we have fewer matches than goal, add more 'less good' matches + if len(matches) < matches_goal: + for i in range(always_accept_this + 1, never_accept_this): + # iteratively increate worse matches + if new := other_matches.get(i): + matches = matches + new + + if len(matches) >= matches_goal: + break + + word_results[word] = matches + + return word_results + + def build_queries(self, query_text) -> list: + """Make all possible subsets of words in the query, including alternative words.""" + query_text = self.spell_check(query_text) + + # find all combinations of the query words as given + queries = list(query_text.keys()) + subsets = list(chain.from_iterable( + (itertools.combinations( + queries, r) for r in range(1, len(queries) + 1)))) + all_queries = [] + + for combination in subsets: + # add the 'default' option + all_queries.append(combination) + # now add all options with all alternatives + for i, word in enumerate(combination): + for alternative in query_text.get(word, []): + alternative_combination = list(combination) + alternative_combination[i] = alternative + all_queries.append(alternative_combination) + + return all_queries + + def weigh_identifiers(self, identifiers: Iterable, weight: int, weighted_ids: Counter) -> Counter: + """Add weights to identifier counter for these identifiers""" + + for identifier in identifiers: + weighted_ids[identifier] += int(weight) + + return weighted_ids + + def search_size_1(self, queries: list, original_words: list, orig_word_weight=11, exact_word_weight=1) -> dict: + """Return a dict of {query_word: Counter(identifier)}. 
+ + queries: is a list of len 1 tuple/lists of words that are a searched word or a 'spell checked' similar word + original words: a list of words actually searched for (not including spellechecked) + + orig_word_weight: additional weight to add to original words + exact_word_weight: additional weight to add to exact word matches (as opposed to be 'in' str) + + First, we find all matching words, creating a dict of words in 'queries' as keys and words matching that query word as list of values + Next, we convert this to identifiers and add weights: + Weight will be increased if matching 'orig_word_weight' or 'exact_word_weight' + """ + matches = {} + # add each word in search index if query_word in word + for word in self.word_to_identifier.keys(): + for query in queries: + # query is list/tuple of len 1 + query_word = query[0] # only use the word + if query_word in word: + words = matches.get(query_word, []) + words.extend([word]) + matches[query_word] = words + + # now convert matched words to matched identifiers + matched_identifiers = {} + for word, matching_words in matches.items(): + for matched_word in matching_words: + weight = self.base_weight + + id_counter = matched_identifiers.get(word, Counter()) + + # add the word n times, where n is the weight, original search word is weighted higher than alternatives + if matched_word in original_words: + weight += orig_word_weight # increase weight for original word + if matched_word == word: + weight += exact_word_weight # increase weight for exact matching word + + id_counter = self.weigh_identifiers(self.word_to_identifier[matched_word], weight, id_counter) + matched_identifiers[word] = id_counter + + return matched_identifiers + + def fuzzy_search(self, text: str) -> list: + """Search the dataframe, finding approximate matches and return a list of identifiers, + ranked by how well each identifier matches the search text. 
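For intuition, with the small test frame used later in this series, a misspelled query word is still resolved through the q-gram / edit-distance spell check, so the "coal production" rows are found anyway (exact ordering depends on the weights):

    engine.fuzzy_search("coal prodction")  # "prodction" is corrected to "production";
                                           # the "coal production" entries rank near the top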
+ """ + + queries = self.build_queries(text) + + # make list of unique original words + orig_words = OrderedDict() + for word in text.split(" "): + orig_words[word] = False + orig_words = orig_words.keys() + orig_words = [self.clean_text(word) for word in orig_words] + + # order the queries by the amount of words they contain + # we do this because longer queries (more words) are harder to find, but we have many alternatives so we search in a smaller search space + queries_by_size = OrderedDict() + longest_query = max([len(q) for q in queries]) + for query_len in range(1, longest_query + 1): + queries_by_size[query_len] = [q for q in queries if len(q) == query_len] + + # first handle queries of length 1 + query_to_identifier = self.search_size_1(queries_by_size[1], orig_words) + + # get all results into a df, we rank further later + all_identifiers = set() + for id_list in [id_list for id_list in query_to_identifier.values()]: + all_identifiers.update(id_list) + search_df = self.df.loc[list(all_identifiers)] + + # now, we search for combinations of query words and get only those identifiers + # we then reduce de search_df further for only those matching identifiers + # we then search the permutations of that set of words + for q_len, query_set in queries_by_size.items(): + if q_len == 1: + # we already did these above + continue + for query in query_set: + + # get the intersection of all identifiers + # meaning, a set of identifiers that occur in ALL sets of len(1) for the individual words in the query + # this ensures we only ever search data where ALL items occur to reduce search-space + query_identifiers = set.intersection(*[set(query_to_identifier.get(q_word)) for q_word in query if + query_to_identifier.get(q_word, False)]) + if len(query_identifiers) == 0: + # there is no match for this combination of query words, skip + break + + # we now add these identifiers to a counter for this query name, + query_name = " ".join(query) + + weight = self.base_weight * q_len + query_to_identifier[query_name] = self.weigh_identifiers(query_identifiers, weight, Counter()) + + # now search for all permutations of this query combined with a space + query_df = search_df[search_df[self.identifier_name].isin(query_identifiers)] + for query_perm in permutations(query): + mask = self.filter_dataframe(query_df, " ".join(query_perm), search_columns=["query_col"]) + new_df = query_df.loc[mask].reset_index(drop=True) + if len(new_df) == 0: + # there is no match for this permutation of words, skip + continue + new_ids = list(new_df[self.identifier_name]) + # we weigh a combination of words and next to each other even higher than just the words separately + weight = self.base_weight * q_len * q_len + query_to_identifier[query_name] = self.weigh_identifiers(new_ids, weight, + query_to_identifier[query_name]) + + # now finally, move to one object sorted list by highest score + all_identifiers = Counter() + for identifiers in query_to_identifier.values(): + all_identifiers += identifiers + + # now sort on highest weights and make list type + sorted_identifiers = [identifier[0] for identifier in all_identifiers.most_common()] + return sorted_identifiers + + def search(self, text) -> list: + """Search the dataframe on this text, return a sorted list of identifiers""" + + literal_identifiers = self.literal_search(text) + fuzzy_identifiers = self.fuzzy_search(text) + + ordered_literal_identifiers = [] + other_identifiers = [] + + # add all fuzzy identifiers to one of two lists, depending on whether they were found in 
literal search or not + # this guarantees we put the literal matches on top, but still sort within this group based on fuzzy scores + for identifier in fuzzy_identifiers: + if identifier in literal_identifiers: + ordered_literal_identifiers.append(identifier) + else: + other_identifiers.append(identifier) + + identifiers = ordered_literal_identifiers + other_identifiers + + return identifiers From 40bf40747f3e3d1b8422f5526f8662f902acf2a8 Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Tue, 22 Jul 2025 10:59:33 +0200 Subject: [PATCH 02/47] Include correct init file as well --- activity_browser/bwutils/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/activity_browser/bwutils/__init__.py b/activity_browser/bwutils/__init__.py index bac9adccd..4bb75c292 100644 --- a/activity_browser/bwutils/__init__.py +++ b/activity_browser/bwutils/__init__.py @@ -12,6 +12,7 @@ from .montecarlo import MonteCarloLCA from .multilca import MLCA, Contributions from .pedigree import PedigreeMatrix +from .search import SearchEngine from .sensitivity_analysis import GlobalSensitivityAnalysis from .superstructure import SuperstructureContributions, SuperstructureMLCA from .uncertainty import (CFUncertaintyInterface, ExchangeUncertaintyInterface, From 413ac1ae1bcc958e70628367c64debdfa6904be4 Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Wed, 23 Jul 2025 15:51:06 +0200 Subject: [PATCH 03/47] Search tests, minor corrections, better documentation --- activity_browser/bwutils/search/__init__.py | 2 +- .../bwutils/search/searchengine.py | 168 ++++++++++------ tests/test_search.py | 181 ++++++++++++++++++ 3 files changed, 289 insertions(+), 62 deletions(-) create mode 100644 tests/test_search.py diff --git a/activity_browser/bwutils/search/__init__.py b/activity_browser/bwutils/search/__init__.py index f9fde759c..042045a6b 100644 --- a/activity_browser/bwutils/search/__init__.py +++ b/activity_browser/bwutils/search/__init__.py @@ -1 +1 @@ -from searchengine import SearchEngine \ No newline at end of file +from .searchengine import SearchEngine \ No newline at end of file diff --git a/activity_browser/bwutils/search/searchengine.py b/activity_browser/bwutils/search/searchengine.py index 84a9c6333..a444e3d19 100644 --- a/activity_browser/bwutils/search/searchengine.py +++ b/activity_browser/bwutils/search/searchengine.py @@ -1,7 +1,8 @@ from itertools import permutations, chain import itertools import functools -from collections import Counter, OrderedDict, Iterable +from collections import Counter, OrderedDict +from typing import Iterable import pandas as pd import numpy as np import re @@ -15,7 +16,8 @@ class SearchEngine: There are three options for search: SearchEngine.literal_search(): searches for exact matches of the search query SearchEngine.fuzzy_search(): searches for approximate matches of search query, sorted by relevance - SearchEngine.search(): combines both of the above, literal matches are returned first, next all fuzzy results, buth subsets sorted by relevance + SearchEngine.search(): combines both of the above, literal matches are returned first, next all fuzzy results, + but subsets sorted by relevance. It is recommended to always use searchEngine.search(), but the other options are there. 
Initialization takes: @@ -37,11 +39,15 @@ def __init__(self, df: pd.DataFrame, identifier_name: str, searchable_columns: l self.SPACE_PATTERN = re.compile(r"[-−:;]") # for replacing with space self.ONE_SPACE_PATTERN = re.compile(r"\s+") # for replacing multiple white space with 1 space - self.q = 2 # character lenght of q grams - self.base_weight = 10 # base weigthing for sorting results + self.q = 2 # character length of q grams + self.base_weight = 10 # base weighting for sorting results + + if identifier_name not in df.columns: # make sure identifier col exist + raise NameError(f"Identifier column {identifier_name} not found in dataframe. Use an existing column name.") + if df[identifier_name].nunique() != df.shape[0]: # make sure identifiers are all unique + raise KeyError( + f"Identifier column {identifier_name} must only contain unique values. Found {df[identifier_name].nunique()} unique values for length {df.shape[0]}") - assert identifier_name in df.columns # make sure identifier col exist - assert df[identifier_name].nunique() == df.shape[0] # make sure identifiers are all unique self.identifier_name = identifier_name # ensure columns given actually exist @@ -99,11 +105,11 @@ def update_dict(update_me: dict, new: dict) -> dict: w2i = self.reverse_dict_many_to_one(i2w) self.word_to_identifier = update_dict(self.word_to_identifier, w2i) - # word to qgram + # word to q-gram w2q = self.list_to_q_grams(w2i.keys()) self.word_to_q_grams = update_dict(self.word_to_q_grams, w2q) - # gram to word + # q-gram to word q2w = self.reverse_dict_many_to_one(w2q) self.q_gram_to_word = update_dict(self.q_gram_to_word, q2w) @@ -115,20 +121,18 @@ def clean_text(self, text: str): return text def text_to_positional_q_gram(self, text: str) -> list: - """Return a positional list of qgrams for the given string. + """Return a positional list of q-grams for the given string. - https://en.wikipedia.org/wiki/N-gram q-grams are n-grams on character level. + q-grams at q=2 of "word" would be "wo", "or" and "rd" + https://en.wikipedia.org/wiki/N-gram - qgrams of "word" would be "wo", "or" and "rd" for q=2 - - Note: these are technically positional q grams, but we don't use their - positions currently + Note: these are technically _positional_ q-grams, but we don't use their positions currently. """ q = self.q # just return a single-item list if the text is equal or shorter than q - # else, generate qgrams + # else, generate q-grams if len(text) <= q: return [text] else: @@ -145,10 +149,9 @@ def words_in_df(self, df: pd.DataFrame = None) -> tuple[dict, pd.DataFrame]: col = [] for row in df.itertuples(index=True): - line = self.clean_text(" ".join(row[1:])) + line = self.clean_text(" | ".join(row[1:])) col.append(line) identifier_word_dict[row[0]] = Counter(line.split(" ")) - return_df["query_col"] = col return identifier_word_dict, return_df @@ -166,6 +169,7 @@ def reverse_dict_many_to_one(self, dictionary: dict) -> dict: def list_to_q_grams(self, word_list: Iterable) -> dict: """Convert a list of unique words to a dict with Counter objects. + Number will be the occurrences of that q-gram in that word. 
q_gram_dict = { "word": Counter( @@ -191,9 +195,13 @@ def add_identifier(self, identifier, data: dict) -> None: identifier is expected to be a unique identifier that has not been used before data is expected to be a dict of column names and data """ - - # make sure we don't add an identifier that already exists - assert identifier not in self.df.index.to_list() + # make sure we the identifier does not yet exist + if identifier in self.df.index.to_list(): + raise Exception( + f"Identifier '{identifier}' is already in use, use a different identifier or use the change_identifier function.") + if data[self.identifier_name] != identifier: + raise Exception( + f"Identifier argument '{identifier}' and data in identifier column '{data[self.identifier_name]}' must be the same.") df_cols = self.columns @@ -217,6 +225,10 @@ def add_identifier(self, identifier, data: dict) -> None: def remove_identifier(self, identifier) -> None: """Remove this identifier from self.df and the search index. """ + # make sure the identifier exists + if identifier not in self.df.index.to_list(): + raise Exception( + f"Identifier '{identifier}' does not exist in the search data, cannot remove identifier that do not exist.") # remove from df self.df.drop(identifier, inplace=True) @@ -238,7 +250,7 @@ def remove_identifier(self, identifier) -> None: del self.word_to_q_grams[word] else: - # remove the identifier from the + # remove the identifier from the dict del self.word_to_identifier[word][identifier] # finally, remove the identifier del self.identifier_to_word[identifier] @@ -251,10 +263,17 @@ def change_identifier(self, identifier, data: dict) -> None: only changed data needs to be supplied """ - assert identifier in self.df.index.to_list() + # make sure the identifier exists + if identifier not in self.df.index.to_list(): + raise Exception( + f"Identifier '{identifier}' does not exist in the search data, use an existing identifier or use the add_identifier function.") + if data[self.identifier_name] != identifier: + raise Exception( + f"Identifier argument '{identifier}' and data in identifier column '{data[self.identifier_name]}' must be the same.") # get existing data - update_data = dict(self.df.loc[identifier].values) + update_data = {col: self.df.loc[identifier, col] for col in self.df.columns} + del update_data["query_col"] # overwrite new data where relevant for field, value in data.items(): @@ -297,35 +316,39 @@ def osa_distance(self, word1: str, word2: str, cutoff: int = 0, cutoff_return: i """Calculate the Optimal String Alignment (OSA) edit distance between two strings, return edit distance. Has additional cutoff variable, if cutoff is higher than 0 and if the words have - a larger difference in length, immediately return a large number + a larger edit distance, return a large number (note: cutoff <= edit_dist, not cutoff < edit_dist) OSA is a restricted form of the Damerau–Levenshtein distance. https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance#Optimal_string_alignment_distance The edit distance is how many operations (insert, delete, substitute or transpose a character) need to happen to convert one string to another. insert and delete are obvious operations, but substitute and transpose are explained: - substitute: replace one character with another: e.g. word1=cat word2=cab, t->b substitution is 1 operation - transpose: swap the places of two adjacent characters with each other: e.g. 
word1=coal word2=cola al -> la transposition is 1 operation + substitute: replace one character with another: e.g. word1='cat' word2='cab', 't'->'b' substitution is 1 operation + transpose: swap the places of two adjacent characters with each other: e.g. word1='coal' word2='cola' 'al' -> 'la' transposition is 1 operation - The minimum amount of operations (OSA edit distance) is returned. + The minimum amount of edit operations (OSA edit distance) is returned. """ - if word1 == word2: # if the strings are the same, immediately return 0 return 0 len1, len2 = len(word1), len(word2) - if 0 < cutoff < abs(len1 - len2): + if 0 < cutoff <= abs(len1 - len2): # if the length difference between 2 words is over the cutoff, # just return instead of calculating the edit distance return cutoff_return if len1 == 0 or len2 == 0: # in case (at least) one of the strings is empty, - # return the lenth of the longest string + # return the length of the longest string return max(len1, len2) + if len1 < len2 and cutoff > 0: + # make sure word1 is always the longest (required for early stopping with cutoff) + word1, word2 = word2, word1 + len1, len2 = len2, len1 + # Initialize matrix distance = [[0] * len2 for _ in range(len1)] @@ -346,9 +369,12 @@ def osa_distance(self, word1: str, word2: str, cutoff: int = 0, cutoff_return: i transposition = distance[i - 2][j - 2] + 1 if i > 1 and j > 1 else max(i, j) - 1 distance[i][j] = min(distance[i][j], transposition) - return distance[len1 - 1][len2 - 1] + # stop early if we surpass cutoff + if 0 < cutoff <= distance[i][j]: + return cutoff_return + return distance[i][j] - def find_q_gram_matches(self, q_grams: list) -> pd.DataFrame: + def find_q_gram_matches(self, q_grams: set) -> pd.DataFrame: """Find which of the given q_grams exist in self.q_gram_to_word, return a sorted dataframe of best matching words. """ @@ -356,7 +382,7 @@ def find_q_gram_matches(self, q_grams: list) -> pd.DataFrame: matches = {} - # find words that match our qgrams + # find words that match our q-grams for q_gram in q_grams: if words := self.q_gram_to_word.get(q_gram, False): # q_gram exists in our search index @@ -371,12 +397,12 @@ def find_q_gram_matches(self, q_grams: list) -> pd.DataFrame: # reduce search results to most relevant results matches = {"word": matches.keys(), "matches": matches.values()} matches = pd.DataFrame(matches) - max_q = max(matches["matches"]) + max_q = max(matches["matches"]) # this has the most matching q-grams # determine how many results we want to keep based on how good our results are - min_q = max(max_q * 0.5, # have at least half of qgrams of best match or... - max(n_q_grams * 0.5, # if more, at least half the qgrams in the query word? - 1)) # okay just do 1 qgram if there are no more in the word + min_q = max(max_q * 0.32, # have at least a third of q-grams of best match or... + max(n_q_grams * 0.5, # if more, at least half the q-grams in the query word? 
+ 1)) # okay just do 1 q-gram if there are no more in the word matches = matches[matches["matches"] >= min_q] matches = matches.sort_values(by="matches", ascending=False) @@ -400,11 +426,9 @@ def spell_check(self, text: str) -> OrderedDict: ) """ - word_results = OrderedDict() matches_goal = 3 # ideally we have at least this many alternatives - always_accept_this = 1 # values of this or lower always accepted never_accept_this = 4 # values this or over always rejected @@ -420,12 +444,12 @@ def spell_check(self, text: str) -> OrderedDict: # first, find possible matches quickly q_grams = self.text_to_positional_q_gram(word) - possible_matches = self.find_q_gram_matches(q_grams) + possible_matches = self.find_q_gram_matches(set(q_grams)) matches = [] other_matches = {} - # now, refine with levenshtein + # now, refine with edit distance for row in possible_matches.itertuples(): edit_distance = self.osa_distance(word, row[1], cutoff=never_accept_this) @@ -443,13 +467,11 @@ def spell_check(self, text: str) -> OrderedDict: # if we have fewer matches than goal, add more 'less good' matches if len(matches) < matches_goal: for i in range(always_accept_this + 1, never_accept_this): - # iteratively increate worse matches + # iteratively increase 'worse' matches so we hit goal of minimum alternatives if new := other_matches.get(i): matches = matches + new - if len(matches) >= matches_goal: break - word_results[word] = matches return word_results @@ -477,19 +499,17 @@ def build_queries(self, query_text) -> list: return all_queries - def weigh_identifiers(self, identifiers: Iterable, weight: int, weighted_ids: Counter) -> Counter: - """Add weights to identifier counter for these identifiers""" - - for identifier in identifiers: - weighted_ids[identifier] += int(weight) - + def weigh_identifiers(self, identifiers: Counter, weight: int, weighted_ids: Counter) -> Counter: + """Add weights to identifier counter for these identifiers times how often it occurs in identifier.""" + for identifier, occurrences in identifiers.items(): + weighted_ids[identifier] += (weight * occurrences) return weighted_ids - def search_size_1(self, queries: list, original_words: list, orig_word_weight=11, exact_word_weight=1) -> dict: + def search_size_1(self, queries: list, original_words: list, orig_word_weight=5, exact_word_weight=1) -> dict: """Return a dict of {query_word: Counter(identifier)}. queries: is a list of len 1 tuple/lists of words that are a searched word or a 'spell checked' similar word - original words: a list of words actually searched for (not including spellechecked) + original words: a list of words actually searched for (not including spellchecked) orig_word_weight: additional weight to add to original words exact_word_weight: additional weight to add to exact word matches (as opposed to be 'in' str) @@ -514,7 +534,6 @@ def search_size_1(self, queries: list, original_words: list, orig_word_weight=11 for word, matching_words in matches.items(): for matched_word in matching_words: weight = self.base_weight - id_counter = matched_identifiers.get(word, Counter()) # add the word n times, where n is the weight, original search word is weighted higher than alternatives @@ -531,6 +550,19 @@ def search_size_1(self, queries: list, original_words: list, orig_word_weight=11 def fuzzy_search(self, text: str) -> list: """Search the dataframe, finding approximate matches and return a list of identifiers, ranked by how well each identifier matches the search text. + + 1. 
First, identifiers matching single words (and spell-checked alternatives) are found and weighted. + 2. If the search term consisted of multiple words, combinations of those words are checked next. + 2.1 Increasing in size (first two words, then three etc.), we look for identifiers that contain that set of + words, these are also weighted, based on the sum of all one-word weights (from first step) and the length + of the sequence. + 2.2 Next, we also look specifically for combinations occurring next to each other. And add more weight like + the step above (2.1). + We multiply the weighting of step 2 by the sequence length, based on the assumption that finding more search + words will be a more relevant result than just finding a single word, and again if they are in the + correct order. + + Finally, all found identifiers are sorted on their weight and returned. """ queries = self.build_queries(text) @@ -569,13 +601,24 @@ def fuzzy_search(self, text: str) -> list: # get the intersection of all identifiers # meaning, a set of identifiers that occur in ALL sets of len(1) for the individual words in the query - # this ensures we only ever search data where ALL items occur to reduce search-space - query_identifiers = set.intersection(*[set(query_to_identifier.get(q_word)) for q_word in query if - query_to_identifier.get(q_word, False)]) - if len(query_identifiers) == 0: + # this ensures we only ever search data where ALL items occur to substantially reduce search-space + # finally, make this a Counter (with each item=1) so we can properly weigh things later + query_identifier_set = set.intersection(*[set(query_to_identifier.get(q_word)) for q_word in query if + query_to_identifier.get(q_word, False)]) + if len(query_identifier_set) == 0: # there is no match for this combination of query words, skip break + # now we convert the query identifiers to a Counter of 'occurrence', + # where we weigh queries with only original words higher + query_identifiers = Counter() + for identifier in query_identifier_set: + weight = 0 + for query_word in query: + weight += query_to_identifier[query_word][identifier] + + query_identifiers[identifier] = weight + # we now add these identifiers to a counter for this query name, query_name = " ".join(query) @@ -590,12 +633,15 @@ def fuzzy_search(self, text: str) -> list: if len(new_df) == 0: # there is no match for this permutation of words, skip continue - new_ids = list(new_df[self.identifier_name]) - # we weigh a combination of words and next to each other even higher than just the words separately - weight = self.base_weight * q_len * q_len + new_id_list = new_df[self.identifier_name] + + new_ids = Counter() + for new_id in new_id_list: + new_ids[new_id] = query_identifiers[new_id] + + # we weigh a combination of words that is next also to each other even higher than just the words separately query_to_identifier[query_name] = self.weigh_identifiers(new_ids, weight, query_to_identifier[query_name]) - # now finally, move to one object sorted list by highest score all_identifiers = Counter() for identifiers in query_to_identifier.values(): diff --git a/tests/test_search.py b/tests/test_search.py new file mode 100644 index 000000000..231859d28 --- /dev/null +++ b/tests/test_search.py @@ -0,0 +1,181 @@ +import pytest +import pandas as pd +from activity_browser.bwutils.search import SearchEngine + + +def data_for_test(): + return pd.DataFrame([ + ["a", "coal production", "coal"], + ["b", "coal production", "something"], + ["c", "coal production", "coat"], + ["d", "coal 
hello production", "something"], + ["e", "dont find me", "hello world"], + ["f", "coat", "another word"], + ["g", "coalispartofthisword", "things"], + ["h", "coal", "coal"], + ], + columns = ["id", "col1", "col2"]) + + +def test_search_init(): + """Do initialization tests.""" + df = data_for_test() + + # init search class with non-existent identifier col and fail + with pytest.raises(Exception): + _ = SearchEngine(df, identifier_name="non_existent_col_name") + # init search class with non-unique identifiers and fail + df2 = df.copy() + df2.iloc[0, 0] = "b" + with pytest.raises(Exception): + _ = SearchEngine(df2, identifier_name="id") + # init search class correctly + se = SearchEngine(df, identifier_name="id") + + +def test_search_base(): + """Do checks for search ranking.""" + + df = data_for_test() + + # init search class and two searches + se = SearchEngine(df, identifier_name="id") + # do search on specific term + assert se.search("coal") == ["a", "h", "c", "b", "d", "g", "f"] + # do search on other term + assert se.search("coal production") == ["a", "c", "b", "d", "h", "f", "g"] + + # init search class with 1 col searchable + se = SearchEngine(df, identifier_name="id", searchable_columns=["col2"]) + assert se.search("coal") == ["a", "h", "c"] + + +def test_search_add_identifier(): + """Do tests for adding identifier.""" + df = data_for_test() + + # create base item to add + new_base_item = { + "id": "i", + "col1": "coal production", + "col2": "coal production" + } + + # use mismatched identifier and fail + se = SearchEngine(df, identifier_name="id") + with pytest.raises(Exception): + se.add_identifier(identifier="j", data=new_base_item) + + # use existing identifier and fail + se = SearchEngine(df, identifier_name="id") + wrong_id = new_base_item.copy() + wrong_id["id"] = "a" + with pytest.raises(Exception): + se.add_identifier(identifier="a", data=wrong_id) + + # use column too many (should be removed) + se = SearchEngine(df, identifier_name="id") + col_more = new_base_item.copy() + col_more["col3"] = "word" + se.add_identifier(identifier="i", data=col_more) + assert "col3" not in se.df.columns + + # use column less (should be filled with empty string) + se = SearchEngine(df, identifier_name="id") + col_less = new_base_item.copy() + del col_less["col2"] + se.add_identifier(identifier="i", data=col_less) + assert se.df.loc["i", "col2"] == "" + + # do search, add item and verify results are different + se = SearchEngine(df, identifier_name="id") + assert se.search("coal production") == ["a", "c", "b", "d", "h", "f", "g"] + se.add_identifier(identifier="i", data=new_base_item) + assert se.search("coal production") == ["i", "a", "c", "b", "d", "h", "f", "g"] + + +def test_search_remove_identifier(): + """Do tests for removing identifier.""" + df = data_for_test() + + # use non-existent identifier and fail + se = SearchEngine(df, identifier_name="id") + with pytest.raises(Exception): + se.remove_identifier(identifier="i") + + # do search, remove item and verify results are different + se = SearchEngine(df, identifier_name="id") + assert se.search("coal production") == ["a", "c", "b", "d", "h", "f", "g"] + se.remove_identifier(identifier="a") + assert se.search("coal production") == ["c", "b", "d", "h", "f", "g"] + + +def test_search_change_identifier(): + """Do tests for changing identifier.""" + df = data_for_test() + + # create base item to add + edit_data = { + "id": "a", + "col1": "cant find me anymore", + "col2": "something different" + } + + # use non-existent identifier and fail + 
se = SearchEngine(df, identifier_name="id") + missing_id = edit_data.copy() + missing_id["id"] = "i" + with pytest.raises(Exception): + se.change_identifier(identifier="i", data=missing_id) + + # use mismatched identifier and fail + se = SearchEngine(df, identifier_name="id") + wrong_id = edit_data.copy() + wrong_id["id"] = "i" + with pytest.raises(Exception): + se.change_identifier(identifier="a", data=wrong_id) + + # do search, change item and verify results are different + se = SearchEngine(df, identifier_name="id") + assert se.search("coal production") == ["a", "c", "b", "d", "h", "f", "g"] + se.change_identifier(identifier="a", data=edit_data) + assert se.search("coal production") == ["c", "b", "d", "h", "f", "g"] + # now change the same item partially and verify results are different + new_edit_data = { + "id": "a", + "col1": "coal" + } + se.change_identifier(identifier="a", data=new_edit_data) + assert se.search("coal production") == ["c", "b", "d", "h", "a", "f", "g"] + + +def test_string_distance(): + """Do tests specifically for string distance function""" + df = data_for_test() + se = SearchEngine(df, identifier_name="id") + + # same word + assert se.osa_distance("coal", "coal") == 0 + # empty string is length of other word + assert se.osa_distance("coal", "") == 4 + + # insert + assert se.osa_distance("coal", "coa") == 1 + # delete + assert se.osa_distance("coal", "coall") == 1 + # substitute + assert se.osa_distance("coal", "coat") == 1 + # transpose + assert se.osa_distance("coal", "cola") == 1 + + # longer edit distance + assert se.osa_distance("coal", "chocolate") == 6 + # reverse order gives same result + assert se.osa_distance("coal", "chocolate") == se.osa_distance("chocolate", "coal") + # cutoff + assert se.osa_distance("coal", "chocolate", cutoff=5, cutoff_return=1000) == 1000 + assert se.osa_distance("coal", "chocolate", cutoff=6, cutoff_return=1000) == 1000 + assert se.osa_distance("coal", "chocolate", cutoff=7, cutoff_return=1000) == 6 + # length cutoff + assert se.osa_distance("coal", "coallongword") == 8 + assert se.osa_distance("coal", "coallongword", cutoff=5, cutoff_return=1000) == 1000 From d01387fa2d0b9b8d4ad5fde805399a8bdfebea72 Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Thu, 24 Jul 2025 12:47:39 +0200 Subject: [PATCH 04/47] Improve search speed with many results. --- .../bwutils/search/searchengine.py | 45 +++++++++---------- 1 file changed, 22 insertions(+), 23 deletions(-) diff --git a/activity_browser/bwutils/search/searchengine.py b/activity_browser/bwutils/search/searchengine.py index a444e3d19..d0a722b22 100644 --- a/activity_browser/bwutils/search/searchengine.py +++ b/activity_browser/bwutils/search/searchengine.py @@ -2,7 +2,7 @@ import itertools import functools from collections import Counter, OrderedDict -from typing import Iterable +from typing import Iterable, Optional import pandas as pd import numpy as np import re @@ -287,13 +287,12 @@ def change_identifier(self, identifier, data: dict) -> None: # +++ Search - def filter_dataframe(self, df: pd.DataFrame, pattern: str, search_columns: list = None) -> pd.Series: + def filter_dataframe(self, df: pd.DataFrame, pattern: str, search_columns: Optional[list] = None) -> pd.Series: """Filter the search columns of a dataframe on a pattern. 
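The returned mask can be applied directly with .loc; a small sketch (the column name follows the test frame used elsewhere in this series):

    mask = engine.filter_dataframe(engine.df, "coal", search_columns=["col1"])
    hits = engine.df.loc[mask]  # rows whose col1 contains the lower-case pattern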
Returns a mask (true/false) pd.Series with matching items.""" search_columns = search_columns if search_columns else self.columns - mask = functools.reduce( np.logical_or, [ @@ -303,13 +302,15 @@ def filter_dataframe(self, df: pd.DataFrame, pattern: str, search_columns: list ) return mask - def literal_search(self, text): + def literal_search(self, text, df: Optional[pd.DataFrame] = None) -> list: """Do literal search of the text in all original columns that were given.""" - identifiers = self.filter_dataframe(self.df, text) - df = self.df.loc[identifiers] - identifiers = df.index.to_list() + if df is None: + df = self.df.copy() + identifiers = self.filter_dataframe(df, text) + df = df.loc[identifiers] + identifiers = df.index.to_list() return identifiers def osa_distance(self, word1: str, word2: str, cutoff: int = 0, cutoff_return: int = 1000) -> int: @@ -652,22 +653,20 @@ def fuzzy_search(self, text: str) -> list: return sorted_identifiers def search(self, text) -> list: - """Search the dataframe on this text, return a sorted list of identifiers""" - - literal_identifiers = self.literal_search(text) + """Search the dataframe on this text, return a sorted list of identifiers.""" fuzzy_identifiers = self.fuzzy_search(text) - - ordered_literal_identifiers = [] - other_identifiers = [] - - # add all fuzzy identifiers to one of two lists, depending on whether they were found in literal search or not - # this guarantees we put the literal matches on top, but still sort within this group based on fuzzy scores - for identifier in fuzzy_identifiers: - if identifier in literal_identifiers: - ordered_literal_identifiers.append(identifier) - else: - other_identifiers.append(identifier) - - identifiers = ordered_literal_identifiers + other_identifiers + if len(fuzzy_identifiers) == 0: + return [] + + # take the fuzzy search sub-set of data and search it literally + df = self.df.loc[fuzzy_identifiers].copy() + literal_identifiers = self.literal_search(text, df) + if len(literal_identifiers) == 0: + return fuzzy_identifiers + + # append any fuzzy identifiers that were not found in the literal search + fuzzy_identifiers = [ + _id for _id in fuzzy_identifiers if _id not in set(literal_identifiers)] + identifiers = literal_identifiers + fuzzy_identifiers return identifiers From 295995e4cbebdcc548185b64648364c2d03f16d6 Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Thu, 24 Jul 2025 16:09:05 +0200 Subject: [PATCH 05/47] Add basic logging to SearchEngine --- .../bwutils/search/searchengine.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/activity_browser/bwutils/search/searchengine.py b/activity_browser/bwutils/search/searchengine.py index d0a722b22..8b097cc84 100644 --- a/activity_browser/bwutils/search/searchengine.py +++ b/activity_browser/bwutils/search/searchengine.py @@ -2,12 +2,17 @@ import itertools import functools from collections import Counter, OrderedDict +from logging import getLogger +from time import time from typing import Iterable, Optional import pandas as pd import numpy as np import re +log = getLogger(__name__) + + class SearchEngine: """ A Search Engine class, takes a dataframe and makes it searchable. 
@@ -33,7 +38,7 @@ class SearchEngine: """ def __init__(self, df: pd.DataFrame, identifier_name: str, searchable_columns: list = []): - + t = time() # compile regex patterns for cleaning self.SUB_PATTERN = re.compile(r"[,\(\)\[\]'\"]") # for replacing with empty string self.SPACE_PATTERN = re.compile(r"[-−:;]") # for replacing with space @@ -70,7 +75,7 @@ def __init__(self, df: pd.DataFrame, identifier_name: str, searchable_columns: l # find the self.identifier_name column index and store as int self.identifier_column = self.columns.index(self.identifier_name) - # store all searchable columns except the identifier + # store all searchable column indices except the identifier self.regular_columns = [i for i in range(len(self.columns)) if i != self.identifier_column] # initialize search index dicts and update df @@ -81,6 +86,7 @@ def __init__(self, df: pd.DataFrame, identifier_name: str, searchable_columns: l self.df = pd.DataFrame() self.update_index(df) + log.debug(f"Search engine initialized in {time() - t:.2f} seconds for {len(self.df)} items") # +++ Utility functions @@ -652,16 +658,21 @@ def fuzzy_search(self, text: str) -> list: sorted_identifiers = [identifier[0] for identifier in all_identifiers.most_common()] return sorted_identifiers - def search(self, text) -> list: + def search(self, text, col_modifiers: Optional[dict] = None) -> list: """Search the dataframe on this text, return a sorted list of identifiers.""" + t = time() fuzzy_identifiers = self.fuzzy_search(text) if len(fuzzy_identifiers) == 0: + log.debug(f"Found 0 search results for '{text}' in {len(self.df)} items in {time() - t:.2f} seconds") return [] # take the fuzzy search sub-set of data and search it literally df = self.df.loc[fuzzy_identifiers].copy() + literal_identifiers = self.literal_search(text, df) if len(literal_identifiers) == 0: + log.debug( + f"Found {len(fuzzy_identifiers)} search results for '{text}' in {len(self.df)} items in {time() - t:.2f} seconds") return fuzzy_identifiers # append any fuzzy identifiers that were not found in the literal search @@ -669,4 +680,6 @@ def search(self, text) -> list: _id for _id in fuzzy_identifiers if _id not in set(literal_identifiers)] identifiers = literal_identifiers + fuzzy_identifiers + log.debug( + f"Found {len(identifiers)} ({len(literal_identifiers)} literal) search results for '{text}' in {len(self.df)} items in {time() - t:.2f} seconds") return identifiers From 79754ca7659e41ba69f2ce103b0e6ee8662821a1 Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Thu, 24 Jul 2025 16:24:34 +0200 Subject: [PATCH 06/47] . 
--- activity_browser/bwutils/search/searchengine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/activity_browser/bwutils/search/searchengine.py b/activity_browser/bwutils/search/searchengine.py index 8b097cc84..86e56f29e 100644 --- a/activity_browser/bwutils/search/searchengine.py +++ b/activity_browser/bwutils/search/searchengine.py @@ -658,7 +658,7 @@ def fuzzy_search(self, text: str) -> list: sorted_identifiers = [identifier[0] for identifier in all_identifiers.most_common()] return sorted_identifiers - def search(self, text, col_modifiers: Optional[dict] = None) -> list: + def search(self, text) -> list: """Search the dataframe on this text, return a sorted list of identifiers.""" t = time() fuzzy_identifiers = self.fuzzy_search(text) From 6bd39f71af2367d1fc1bfea115012d75bee95402 Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Mon, 28 Jul 2025 13:48:07 +0200 Subject: [PATCH 07/47] Base implementation of metadata specific class --- activity_browser/bwutils/__init__.py | 2 +- activity_browser/bwutils/metadata.py | 14 +- activity_browser/bwutils/search/__init__.py | 2 +- .../bwutils/search/searchengine.py | 185 +++++++++++++++++- 4 files changed, 197 insertions(+), 6 deletions(-) diff --git a/activity_browser/bwutils/__init__.py b/activity_browser/bwutils/__init__.py index 4bb75c292..12b565e61 100644 --- a/activity_browser/bwutils/__init__.py +++ b/activity_browser/bwutils/__init__.py @@ -12,7 +12,7 @@ from .montecarlo import MonteCarloLCA from .multilca import MLCA, Contributions from .pedigree import PedigreeMatrix -from .search import SearchEngine +from .search import SearchEngine, MetaDataSearchEngine from .sensitivity_analysis import GlobalSensitivityAnalysis from .superstructure import SuperstructureContributions, SuperstructureMLCA from .uncertainty import (CFUncertaintyInterface, ExchangeUncertaintyInterface, diff --git a/activity_browser/bwutils/metadata.py b/activity_browser/bwutils/metadata.py index 9790e7885..dc7e7328a 100644 --- a/activity_browser/bwutils/metadata.py +++ b/activity_browser/bwutils/metadata.py @@ -7,8 +7,6 @@ from typing import Set from logging import getLogger -from playhouse.shortcuts import model_to_dict - import pandas as pd from qtpy.QtCore import Qt, QObject, Signal, SignalInstance @@ -17,6 +15,8 @@ from bw2data.errors import UnknownObject from bw2data.backends import sqlite3_lci_db, ActivityDataset +from activity_browser.bwutils.search import MetaDataSearchEngine + from activity_browser import signals @@ -190,6 +190,7 @@ def sync(self) -> None: con.close() self.dataframe = self._parse_df(node_df) + self.init_search() # init search index self.synced.emit() @@ -333,5 +334,14 @@ def _unpacker(self, classifications: list, system: str) -> list: system_classifications.append(result) # result is either "" or the classification return system_classifications + def init_search(self): + allowed_cols = [ + "id", "name", "synonyms", "unit", "key", "database", # generic + "CAS number", "categories", # biosphere specific + "product", "reference product", "classifications", "location", "properties" # activity specific + ] + + MetaDataSearchEngine(self.dataframe, identifier_name="id", searchable_columns=allowed_cols) + AB_metadata = MetaDataStore() diff --git a/activity_browser/bwutils/search/__init__.py b/activity_browser/bwutils/search/__init__.py index 042045a6b..85e30c9be 100644 --- a/activity_browser/bwutils/search/__init__.py +++ b/activity_browser/bwutils/search/__init__.py @@ -1 +1 @@ -from .searchengine import SearchEngine \ No newline at 
end of file +from .searchengine import SearchEngine, MetaDataSearchEngine \ No newline at end of file diff --git a/activity_browser/bwutils/search/searchengine.py b/activity_browser/bwutils/search/searchengine.py index 86e56f29e..f1a32f4a1 100644 --- a/activity_browser/bwutils/search/searchengine.py +++ b/activity_browser/bwutils/search/searchengine.py @@ -676,9 +676,190 @@ def search(self, text) -> list: return fuzzy_identifiers # append any fuzzy identifiers that were not found in the literal search - fuzzy_identifiers = [ + remaining_fuzzy_identifiers = [ _id for _id in fuzzy_identifiers if _id not in set(literal_identifiers)] - identifiers = literal_identifiers + fuzzy_identifiers + identifiers = literal_identifiers + remaining_fuzzy_identifiers + + log.debug( + f"Found {len(identifiers)} ({len(literal_identifiers)} literal) search results for '{text}' in {len(self.df)} items in {time() - t:.2f} seconds") + return identifiers + + +class MetaDataSearchEngine(SearchEngine): + def find_q_gram_matches(self, q_grams: set) -> pd.DataFrame: + """Overwritten for extra database specific reduction of results. + """ + n_q_grams = len(q_grams) + + matches = {} + + # find words that match our q-grams + for q_gram in q_grams: + if words := self.q_gram_to_word.get(q_gram, False): + # q_gram exists in our search index + for word in words: + if isinstance(self.database_ids, set): + # DATABASE SPECIFIC now filter on whether word is in the database + in_db = False + for _id in self.word_to_identifier[word]: + if _id in self.database_ids: + in_db = True + break + else: + in_db = True + if in_db: + matches[word] = matches.get(word, 0) + words[word] + + # if we find no results, return an empty dataframe + if len(matches) == 0: + return pd.DataFrame({"word": [], "matches": []}) + + # otherwise, create a dataframe and + # reduce search results to most relevant results + matches = {"word": matches.keys(), "matches": matches.values()} + matches = pd.DataFrame(matches) + max_q = max(matches["matches"]) # this has the most matching q-grams + + # determine how many results we want to keep based on how good our results are + min_q = max(max_q * 0.32, # have at least a third of q-grams of best match or... + max(n_q_grams * 0.5, # if more, at least half the q-grams in the query word? + 1)) # okay just do 1 q-gram if there are no more in the word + + matches = matches[matches["matches"] >= min_q] + matches = matches.sort_values(by="matches", ascending=False) + matches = matches.reset_index(drop=True) + + return matches.iloc[:min(len(matches), 2500), :] # return at most this many results + + def fuzzy_search(self, text: str) -> list: + """Overwritten for extra database specific reduction of results. 
+ """ + queries = self.build_queries(text) + + # make list of unique original words + orig_words = OrderedDict() + for word in text.split(" "): + orig_words[word] = False + orig_words = orig_words.keys() + orig_words = [self.clean_text(word) for word in orig_words] + + # order the queries by the amount of words they contain + # we do this because longer queries (more words) are harder to find, but we have many alternatives so we search in a smaller search space + queries_by_size = OrderedDict() + longest_query = max([len(q) for q in queries]) + for query_len in range(1, longest_query + 1): + queries_by_size[query_len] = [q for q in queries if len(q) == query_len] + + # first handle queries of length 1 + query_to_identifier = self.search_size_1(queries_by_size[1], orig_words) + + # DATABASE SPECIFIC ensure all identifiers are in the database + if isinstance(self.database_ids, set): + new_q2i = {} + for word, _ids in query_to_identifier.items(): + keep = set.intersection(set(_ids.keys()), self.database_ids) + new_id_counter = Counter() + for _id in keep: + new_id_counter[_id] = _ids[_id] + if len(new_id_counter) > 0: + new_q2i[word] = new_id_counter + query_to_identifier = new_q2i + + # get all results into a df, we rank further later + all_identifiers = set() + for id_list in [id_list for id_list in query_to_identifier.values()]: + all_identifiers.update(id_list) + search_df = self.df.loc[list(all_identifiers)] + + # now, we search for combinations of query words and get only those identifiers + # we then reduce de search_df further for only those matching identifiers + # we then search the permutations of that set of words + for q_len, query_set in queries_by_size.items(): + if q_len == 1: + # we already did these above + continue + for query in query_set: + + # get the intersection of all identifiers + # meaning, a set of identifiers that occur in ALL sets of len(1) for the individual words in the query + # this ensures we only ever search data where ALL items occur to substantially reduce search-space + # finally, make this a Counter (with each item=1) so we can properly weigh things later + query_identifier_set = set.intersection(*[set(query_to_identifier.get(q_word)) for q_word in query if + query_to_identifier.get(q_word, False)]) + if len(query_identifier_set) == 0: + # there is no match for this combination of query words, skip + break + + # now we convert the query identifiers to a Counter of 'occurrence', + # where we weigh queries with only original words higher + query_identifiers = Counter() + for identifier in query_identifier_set: + weight = 0 + for query_word in query: + weight += query_to_identifier[query_word][identifier] + + query_identifiers[identifier] = weight + + # we now add these identifiers to a counter for this query name, + query_name = " ".join(query) + + weight = self.base_weight * q_len + query_to_identifier[query_name] = self.weigh_identifiers(query_identifiers, weight, Counter()) + + # now search for all permutations of this query combined with a space + query_df = search_df[search_df[self.identifier_name].isin(query_identifiers)] + for query_perm in permutations(query): + mask = self.filter_dataframe(query_df, " ".join(query_perm), search_columns=["query_col"]) + new_df = query_df.loc[mask].reset_index(drop=True) + if len(new_df) == 0: + # there is no match for this permutation of words, skip + continue + new_id_list = new_df[self.identifier_name] + + new_ids = Counter() + for new_id in new_id_list: + new_ids[new_id] = query_identifiers[new_id] + + # we 
weigh a combination of words that is next also to each other even higher than just the words separately + query_to_identifier[query_name] = self.weigh_identifiers(new_ids, weight, + query_to_identifier[query_name]) + # now finally, move to one object sorted list by highest score + all_identifiers = Counter() + for identifiers in query_to_identifier.values(): + all_identifiers += identifiers + + # now sort on highest weights and make list type + sorted_identifiers = [identifier[0] for identifier in all_identifiers.most_common()] + return sorted_identifiers + + def search(self, text, database: Optional[str] = None) -> list: + """Search the dataframe on this text, return a sorted list of identifiers.""" + t = time() + + # get the set of ids that is in this database + if database is not None: + self.database_ids = set(self.df[self.df["database"] == database].index.to_list()) + else: + self.database_ids = None + + fuzzy_identifiers = self.fuzzy_search(text) + if len(fuzzy_identifiers) == 0: + log.debug(f"Found 0 search results for '{text}' in {len(self.df)} items in {time() - t:.2f} seconds") + return [] + + # take the fuzzy search sub-set of data and search it literally + df = self.df.loc[fuzzy_identifiers].copy() + + literal_identifiers = self.literal_search(text, df) + if len(literal_identifiers) == 0: + log.debug( + f"Found {len(fuzzy_identifiers)} search results for '{text}' in {len(self.df)} items in {time() - t:.2f} seconds") + return fuzzy_identifiers + + # append any fuzzy identifiers that were not found in the literal search + remaining_fuzzy_identifiers = [ + _id for _id in fuzzy_identifiers if _id not in set(literal_identifiers)] + identifiers = literal_identifiers + remaining_fuzzy_identifiers log.debug( f"Found {len(identifiers)} ({len(literal_identifiers)} literal) search results for '{text}' in {len(self.df)} items in {time() - t:.2f} seconds") From 91a3328bfd732568e43029433a3e4adfff639171 Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Mon, 18 Aug 2025 10:04:55 +0200 Subject: [PATCH 08/47] minor changes to searchengine --- .../bwutils/search/searchengine.py | 26 ++++++++++++++----- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/activity_browser/bwutils/search/searchengine.py b/activity_browser/bwutils/search/searchengine.py index f1a32f4a1..cbb088afd 100644 --- a/activity_browser/bwutils/search/searchengine.py +++ b/activity_browser/bwutils/search/searchengine.py @@ -195,19 +195,19 @@ def list_to_q_grams(self, word_list: Iterable) -> dict: # +++ Changes to searchable data - def add_identifier(self, identifier, data: dict) -> None: + def add_identifier(self, data: dict, make_searchable=[]) -> None: """Add this identifier to the search index. 
identifier is expected to be a unique identifier that has not been used before data is expected to be a dict of column names and data """ + #TODO add ability to add new columns with make_searchable + identifier = data[self.identifier_name] + # make sure we the identifier does not yet exist if identifier in self.df.index.to_list(): raise Exception( f"Identifier '{identifier}' is already in use, use a different identifier or use the change_identifier function.") - if data[self.identifier_name] != identifier: - raise Exception( - f"Identifier argument '{identifier}' and data in identifier column '{data[self.identifier_name]}' must be the same.") df_cols = self.columns @@ -273,9 +273,12 @@ def change_identifier(self, identifier, data: dict) -> None: if identifier not in self.df.index.to_list(): raise Exception( f"Identifier '{identifier}' does not exist in the search data, use an existing identifier or use the add_identifier function.") - if data[self.identifier_name] != identifier: + if self.identifier_name in data.keys() and data[self.identifier_name] != identifier: raise Exception( - f"Identifier argument '{identifier}' and data in identifier column '{data[self.identifier_name]}' must be the same.") + "Identifier field cannot be changed, first remove item and then add new identifier") + if "query_col" in data.keys(): + log.debug( + f"Field 'query_col' is a protected field for search engine and will be ignored for changing {identifier}") # get existing data update_data = {col: self.df.loc[identifier, col] for col in self.df.columns} @@ -289,7 +292,7 @@ def change_identifier(self, identifier, data: dict) -> None: self.remove_identifier(identifier) # add entry with new data - self.add_identifier(identifier, update_data) + self.add_identifier(update_data) # +++ Search @@ -661,6 +664,11 @@ def fuzzy_search(self, text: str) -> list: def search(self, text) -> list: """Search the dataframe on this text, return a sorted list of identifiers.""" t = time() + + if len(text) == 0: + log.debug(f"Empty search, returned all items") + return self.df.index.to_list() + fuzzy_identifiers = self.fuzzy_search(text) if len(fuzzy_identifiers) == 0: log.debug(f"Found 0 search results for '{text}' in {len(self.df)} items in {time() - t:.2f} seconds") @@ -836,6 +844,10 @@ def search(self, text, database: Optional[str] = None) -> list: """Search the dataframe on this text, return a sorted list of identifiers.""" t = time() + if len(text) == 0: + log.debug(f"Empty search, returned all items") + return self.df.index.to_list() + # get the set of ids that is in this database if database is not None: self.database_ids = set(self.df[self.df["database"] == database].index.to_list()) From 8ec0cf401527b4d5e070662a54977fbc53493f16 Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Tue, 2 Sep 2025 12:46:40 +0200 Subject: [PATCH 09/47] - Solve bug in OSA distance for early stopping with long similar strings - Add and improve tests --- .../bwutils/search/searchengine.py | 92 ++++++++++---- tests/test_search.py | 117 +++++++++++++----- 2 files changed, 151 insertions(+), 58 deletions(-) diff --git a/activity_browser/bwutils/search/searchengine.py b/activity_browser/bwutils/search/searchengine.py index cbb088afd..913242a3a 100644 --- a/activity_browser/bwutils/search/searchengine.py +++ b/activity_browser/bwutils/search/searchengine.py @@ -39,6 +39,8 @@ class SearchEngine: def __init__(self, df: pd.DataFrame, identifier_name: str, searchable_columns: list = []): t = time() + log.debug(f"SearchEngine initializing for {len(df)} items") + # 
compile regex patterns for cleaning self.SUB_PATTERN = re.compile(r"[,\(\)\[\]'\"]") # for replacing with empty string self.SPACE_PATTERN = re.compile(r"[-−:;]") # for replacing with space @@ -86,7 +88,8 @@ def __init__(self, df: pd.DataFrame, identifier_name: str, searchable_columns: l self.df = pd.DataFrame() self.update_index(df) - log.debug(f"Search engine initialized in {time() - t:.2f} seconds for {len(self.df)} items") + + log.debug(f"SearchEngine Initialized in {time() - t:.2f} seconds") # +++ Utility functions @@ -102,6 +105,9 @@ def update_dict(update_me: dict, new: dict) -> dict: update_me[dict_key] = _counter return update_me + t = time() + size_old = len(self.df) + # identifier to word and df i2w, update_df = self.words_in_df(update_df) self.identifier_to_word = update_dict(self.identifier_to_word, i2w) @@ -119,6 +125,13 @@ def update_dict(update_me: dict, new: dict) -> dict: q2w = self.reverse_dict_many_to_one(w2q) self.q_gram_to_word = update_dict(self.q_gram_to_word, q2w) + size_new = len(self.df) + size_dif = size_new - size_old + size_msg = (f"{size_dif} changed items at {round(size_dif/(time() - t), 0)} items/sec " + f"({size_new} items currently)") if size_dif > 1 \ + else f"1 changed item ({size_new} items currently)" + log.debug(f"Search index updated in {time() - t:.2f} seconds for {size_msg}.") + def clean_text(self, text: str): """Clean a string so it doesn't contain weird characters or multiple spaces etc.""" text = self.SUB_PATTERN.sub("", text.lower()) @@ -193,6 +206,13 @@ def list_to_q_grams(self, word_list: Iterable) -> dict: return q_gram_dict + def word_in_index(self, word: str) -> bool: + """Convenience function to check if a single word is in the search index.""" + if " " in word: + raise Exception( + f"Given word '{word}' must not contain spaces.") + return word in self.word_to_identifier.keys() + # +++ Changes to searchable data def add_identifier(self, data: dict, make_searchable=[]) -> None: @@ -228,9 +248,12 @@ def add_identifier(self, data: dict, make_searchable=[]) -> None: # update the search index data self.update_index(new_df) - def remove_identifier(self, identifier) -> None: + def remove_identifier(self, identifier, logging=True) -> None: """Remove this identifier from self.df and the search index. """ + if logging: + t = time() + # make sure the identifier exists if identifier not in self.df.index.to_list(): raise Exception( @@ -261,6 +284,10 @@ def remove_identifier(self, identifier) -> None: # finally, remove the identifier del self.identifier_to_word[identifier] + if logging: + log.debug(f"Search index updated in {time() - t:.2f} seconds " + f"for 1 removed item ({len(self.df)} items currently).") + def change_identifier(self, identifier, data: dict) -> None: """Change this identifier. 
@@ -289,7 +316,7 @@ def change_identifier(self, identifier, data: dict) -> None: update_data[field] = value # remove the entry - self.remove_identifier(identifier) + self.remove_identifier(identifier, logging=False) # add entry with new data self.add_identifier(update_data) @@ -380,7 +407,7 @@ def osa_distance(self, word1: str, word2: str, cutoff: int = 0, cutoff_return: i distance[i][j] = min(distance[i][j], transposition) # stop early if we surpass cutoff - if 0 < cutoff <= distance[i][j]: + if 0 < cutoff <= min(distance[i]): return cutoff_return return distance[i][j] @@ -428,19 +455,24 @@ def spell_check(self, text: str) -> OrderedDict: We rank alternative words based on 1) edit distance 2) how often a word is used in an entry If too many results are found, we only keep edit distance 1, - if we want more results, we keep with longer edit distance up to ... - + if we want more results, we keep with longer edit distance up to `never_accept_this` word_results = OrderedDict( - "word": [word, work] + "word": [work] ) + NOTE: only ALTERNATIVES are ever returned, this function returns empty list for item BOTH when + 1) the exact word is in the data + 2) when there are no suitable alternatives """ + count_occurence = lambda x: sum(self.word_to_identifier[x].values()) # count occurences of a word + word_results = OrderedDict() - matches_goal = 3 # ideally we have at least this many alternatives - always_accept_this = 1 # values of this or lower always accepted - never_accept_this = 4 # values this or over always rejected + matches_min = 3 # ideally we have at least this many alternatives + matches_max = 10 # ideally don't much more than this many matches + always_accept_this = 1 # values of this edit distance or lower always accepted + never_accept_this = 4 # values this edit distance or over always rejected # make list of unique words words = OrderedDict() @@ -451,12 +483,12 @@ def spell_check(self, text: str) -> OrderedDict: words = [self.clean_text(word) for word in words] for word in words: - # first, find possible matches quickly q_grams = self.text_to_positional_q_gram(word) possible_matches = self.find_q_gram_matches(set(q_grams)) matches = [] + first_matches = Counter() other_matches = {} # now, refine with edit distance @@ -467,23 +499,33 @@ def spell_check(self, text: str) -> OrderedDict: if edit_distance == 0: continue # we are looking for alternatives only, not the exact word elif edit_distance <= always_accept_this: - matches.append(row[1]) + first_matches[row[1]] = count_occurence(row[1]) elif edit_distance < never_accept_this: - if other_matches.get(edit_distance): - other_matches[edit_distance].append(row[1]) - else: - other_matches[edit_distance] = [row[1]] + if not other_matches.get(edit_distance): + other_matches[edit_distance] = Counter() + other_matches[edit_distance][row[1]] = count_occurence(row[1]) + else: + continue + # add matches in correct order: + for match, _ in first_matches.most_common(): + matches.append(match) # if we have fewer matches than goal, add more 'less good' matches - if len(matches) < matches_goal: + if len(matches) < matches_min: for i in range(always_accept_this + 1, never_accept_this): - # iteratively increase 'worse' matches so we hit goal of minimum alternatives + # iteratively increase matches with 'worse' results so we hit goal of minimum alternatives if new := other_matches.get(i): - matches = matches + new - if len(matches) >= matches_goal: - break - word_results[word] = matches + prev_num = 10e100 + for match, num in new.most_common(): + if 
num == prev_num: + matches.append(match) + elif num != prev_num and len(matches <= matches_max): + matches.append(match) + else: + break + prev_num = num + word_results[word] = matches return word_results def build_queries(self, query_text) -> list: @@ -515,7 +557,7 @@ def weigh_identifiers(self, identifiers: Counter, weight: int, weighted_ids: Cou weighted_ids[identifier] += (weight * occurrences) return weighted_ids - def search_size_1(self, queries: list, original_words: list, orig_word_weight=5, exact_word_weight=1) -> dict: + def search_size_1(self, queries: list, original_words: set, orig_word_weight=5, exact_word_weight=1) -> dict: """Return a dict of {query_word: Counter(identifier)}. queries: is a list of len 1 tuple/lists of words that are a searched word or a 'spell checked' similar word @@ -582,7 +624,7 @@ def fuzzy_search(self, text: str) -> list: for word in text.split(" "): orig_words[word] = False orig_words = orig_words.keys() - orig_words = [self.clean_text(word) for word in orig_words] + orig_words = {self.clean_text(word) for word in orig_words} # order the queries by the amount of words they contain # we do this because longer queries (more words) are harder to find, but we have many alternatives so we search in a smaller search space @@ -749,7 +791,7 @@ def fuzzy_search(self, text: str) -> list: for word in text.split(" "): orig_words[word] = False orig_words = orig_words.keys() - orig_words = [self.clean_text(word) for word in orig_words] + orig_words = {self.clean_text(word) for word in orig_words} # order the queries by the amount of words they contain # we do this because longer queries (more words) are harder to find, but we have many alternatives so we search in a smaller search space diff --git a/tests/test_search.py b/tests/test_search.py index 231859d28..036e6c864 100644 --- a/tests/test_search.py +++ b/tests/test_search.py @@ -17,6 +17,7 @@ def data_for_test(): columns = ["id", "col1", "col2"]) +# test standard init def test_search_init(): """Do initialization tests.""" df = data_for_test() @@ -33,8 +34,90 @@ def test_search_init(): se = SearchEngine(df, identifier_name="id") +# test internals +def test_reverse_dict(): + """Do test to reverse the special Counter dict.""" + df = data_for_test() + se = SearchEngine(df, identifier_name="id") + + # reverse once and verify + w2i = se.reverse_dict_many_to_one(se.identifier_to_word) + assert w2i == se.word_to_identifier + + # reverse same and verify is same as input + i2w = se.reverse_dict_many_to_one(w2i) + assert i2w == se.identifier_to_word + + +def test_string_distance(): + """Do tests specifically for string distance function.""" + df = data_for_test() + se = SearchEngine(df, identifier_name="id") + + # same word + assert se.osa_distance("coal", "coal") == 0 + # empty string is length of other word + assert se.osa_distance("coal", "") == 4 + + # insert + assert se.osa_distance("coal", "coa") == 1 + # delete + assert se.osa_distance("coal", "coall") == 1 + # substitute + assert se.osa_distance("coal", "coat") == 1 + # transpose + assert se.osa_distance("coal", "cola") == 1 + + # longer edit distance + assert se.osa_distance("coal", "chocolate") == 6 + # reverse order gives same result + assert se.osa_distance("coal", "chocolate") == se.osa_distance("chocolate", "coal") + # cutoff + assert se.osa_distance("coal", "chocolate", cutoff=5, cutoff_return=1000) == 1000 + assert se.osa_distance("coal", "chocolate", cutoff=6, cutoff_return=1000) == 1000 + assert se.osa_distance("coal", "chocolate", cutoff=7, 
cutoff_return=1000) == 6 + # length cutoff + assert se.osa_distance("coal", "coallongword") == 8 + assert se.osa_distance("coal", "coallongword", cutoff=5, cutoff_return=1000) == 1000 + + # two entirely different words (test of early stopping) + assert se.osa_distance("brown", "jumped") == 6 + assert se.osa_distance("brown", "jumped", cutoff=6, cutoff_return=1000) == 1000 + assert se.osa_distance("brown", "jumped", cutoff=7, cutoff_return=1000) == 6 + + +# test functionality +def test_in_index(): + """Do checks for checking if word is in the index.""" + df = data_for_test() + se = SearchEngine(df, identifier_name="id") + + # use string with space + with pytest.raises(Exception): + se.word_in_index("coal and space") + + assert se.word_in_index("coal") + assert not se.word_in_index("coa") + + +def test_spellcheck(): + """Do checks spell checking.""" + df = data_for_test() + se = SearchEngine(df, identifier_name="id") + + checked = se.spell_check("coa productions something flintstones") + # coal HAS to be first, it is found more often in the data + assert checked["coa"] == ["coal", "coat"] + # find production + assert checked["productions"] == ["production"] + # should be empty as there is no alternative (but this word occurs) + assert checked["something"] == [] + # should be empty as there is no alternative (does not exist) + assert checked["flintstones"] == [] + + def test_search_base(): - """Do checks for search ranking.""" + """Do checks for correct search ranking.""" df = data_for_test() @@ -147,35 +230,3 @@ def test_search_change_identifier(): } se.change_identifier(identifier="a", data=new_edit_data) assert se.search("coal production") == ["c", "b", "d", "h", "a", "f", "g"] - - -def test_string_distance(): - """Do tests specifically for string distance function""" - df = data_for_test() - se = SearchEngine(df, identifier_name="id") - - # same word - assert se.osa_distance("coal", "coal") == 0 - # empty string is length of other word - assert se.osa_distance("coal", "") == 4 - - # insert - assert se.osa_distance("coal", "coa") == 1 - # delete - assert se.osa_distance("coal", "coall") == 1 - # substitute - assert se.osa_distance("coal", "coat") == 1 - # transpose - assert se.osa_distance("coal", "cola") == 1 - - # longer edit distance - assert se.osa_distance("coal", "chocolate") == 6 - # reverse order gives same result - assert se.osa_distance("coal", "chocolate") == se.osa_distance("chocolate", "coal") - # cutoff - assert se.osa_distance("coal", "chocolate", cutoff=5, cutoff_return=1000) == 1000 - assert se.osa_distance("coal", "chocolate", cutoff=6, cutoff_return=1000) == 1000 - assert se.osa_distance("coal", "chocolate", cutoff=7, cutoff_return=1000) == 6 - # length cutoff - assert se.osa_distance("coal", "coallongword") == 8 - assert se.osa_distance("coal", "coallongword", cutoff=5, cutoff_return=1000) == 1000 From e2bb1cf6f732b469e3558fb484982ba961cbb874 Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Wed, 3 Sep 2025 11:37:31 +0200 Subject: [PATCH 10/47] update add/change identifier (and tests) to accept dataframes instead of dicts --- .../bwutils/search/searchengine.py | 91 +++++++++++-------- tests/test_search.py | 56 ++++++------ 2 files changed, 80 insertions(+), 67 deletions(-) diff --git a/activity_browser/bwutils/search/searchengine.py b/activity_browser/bwutils/search/searchengine.py index 913242a3a..aaf090eb6 100644 --- a/activity_browser/bwutils/search/searchengine.py +++ b/activity_browser/bwutils/search/searchengine.py @@ -78,7 +78,7 @@ def __init__(self, df: 
pd.DataFrame, identifier_name: str, searchable_columns: l self.identifier_column = self.columns.index(self.identifier_name) # store all searchable column indices except the identifier - self.regular_columns = [i for i in range(len(self.columns)) if i != self.identifier_column] + self.searchable_columns = [i for i in range(len(self.columns)) if i != self.identifier_column] # initialize search index dicts and update df self.identifier_to_word = {} @@ -112,6 +112,7 @@ def update_dict(update_me: dict, new: dict) -> dict: i2w, update_df = self.words_in_df(update_df) self.identifier_to_word = update_dict(self.identifier_to_word, i2w) self.df = pd.concat([self.df, update_df]) + self.df = self.df.fillna("") # ensure we don't add unwanted NA through concatenations # word to identifier w2i = self.reverse_dict_many_to_one(i2w) @@ -163,7 +164,7 @@ def words_in_df(self, df: pd.DataFrame = None) -> tuple[dict, pd.DataFrame]: df = df if any(df) else self.df return_df = df.copy() - df = df.iloc[:, self.regular_columns] + df = df.iloc[:, self.searchable_columns] identifier_word_dict = {} col = [] @@ -215,38 +216,47 @@ def word_in_index(self, word: str) -> bool: # +++ Changes to searchable data - def add_identifier(self, data: dict, make_searchable=[]) -> None: - """Add this identifier to the search index. + def add_identifier(self, data: pd.DataFrame) -> None: + """Add this data to the search index. - identifier is expected to be a unique identifier that has not been used before - data is expected to be a dict of column names and data + identifier column is REQUIRED to be present + ALL data in the given dataframe will be added, if columns should not be added, they should be removed before + calling this function """ - #TODO add ability to add new columns with make_searchable - identifier = data[self.identifier_name] - # make sure we the identifier does not yet exist - if identifier in self.df.index.to_list(): + # ensure we have identifier column + if self.identifier_name not in data.columns: raise Exception( - f"Identifier '{identifier}' is already in use, use a different identifier or use the change_identifier function.") - - df_cols = self.columns + f"Identifier column '{self.identifier_name}' not in new data, impossible to add data without identifier") - # drop fields that are not in self.df - drop = [col for col in data if col not in df_cols] - for field in drop: - del data[field] + # make sure we the identifier does not yet exist + existing_ids = set(self.df.index.to_list()) + for identifier in data[self.identifier_name]: + if identifier in existing_ids: + raise Exception( + f"Identifier '{identifier}' is already in use, use a different identifier or use the change_identifier function.") - # add empty field for missing data + df_cols = self.columns + # add cols to new data that are missing for col in df_cols: - if col not in data: - data[col] = "" - - # convert to df - new_df = pd.DataFrame(data, index=[identifier]) - new_df = new_df.astype(str) + if col not in data.columns: + data[col] = [""] * len(data) + # re-order cols, first existing, then new + new_cols = [col for col in data.columns if col not in self.columns if col not in set(df_cols)] + data_cols = df_cols + new_cols + data = data[data_cols] # re-order new data to be in correct order + + # add cols from new data to correct places + self.columns.extend(new_cols) + self.searchable_columns.extend([i for i, col in enumerate(data_cols) if col in new_cols]) + + # convert df + data = data.set_index(self.identifier_name, drop=False) + data = 
data.fillna("") + data = data.astype(str) # update the search index data - self.update_index(new_df) + self.update_index(data) def remove_identifier(self, identifier, logging=True) -> None: """Remove this identifier from self.df and the search index. @@ -288,37 +298,40 @@ def remove_identifier(self, identifier, logging=True) -> None: log.debug(f"Search index updated in {time() - t:.2f} seconds " f"for 1 removed item ({len(self.df)} items currently).") - def change_identifier(self, identifier, data: dict) -> None: + def change_identifier(self, identifier, data: pd.DataFrame) -> None: """Change this identifier. - identifier is expected to be a unique identifier that is in use - data is expected to be a dict of column names and data that change - - only changed data needs to be supplied + identifier must be an identifier that is in use + data must be a dataframe of 1 row with all change data + data is overwritten with the new data in 'data', columns not given remain unchanged """ - # make sure the identifier exists + + # make sure only 1 change item is given + if len(data) > 1 or len(data) < 1: + raise Exception( + f"change data must be for exactly 1 identifier, but {len(data)} items were given.") + # make sure correct use of identifier if identifier not in self.df.index.to_list(): raise Exception( f"Identifier '{identifier}' does not exist in the search data, use an existing identifier or use the add_identifier function.") - if self.identifier_name in data.keys() and data[self.identifier_name] != identifier: + if self.identifier_name in data.columns and data[self.identifier_name].to_list() != [identifier]: raise Exception( "Identifier field cannot be changed, first remove item and then add new identifier") if "query_col" in data.keys(): log.debug( f"Field 'query_col' is a protected field for search engine and will be ignored for changing {identifier}") - # get existing data - update_data = {col: self.df.loc[identifier, col] for col in self.df.columns} - del update_data["query_col"] # overwrite new data where relevant - for field, value in data.items(): - update_data[field] = value + update_data = self.df.loc[[identifier], self.columns] + data = data.reset_index(drop=True) + for col in data.columns: + value = data.loc[0, col] + update_data[col] = [value] # remove the entry self.remove_identifier(identifier, logging=False) - - # add entry with new data + # add entry with updated data self.add_identifier(update_data) # +++ Search diff --git a/tests/test_search.py b/tests/test_search.py index 036e6c864..620c5c3b8 100644 --- a/tests/test_search.py +++ b/tests/test_search.py @@ -138,42 +138,43 @@ def test_search_add_identifier(): df = data_for_test() # create base item to add - new_base_item = { - "id": "i", - "col1": "coal production", - "col2": "coal production" - } + new_base_item = pd.DataFrame([ + ["i", "coal production", "coal production"], + ], + columns=["id", "col1", "col2"]) - # use mismatched identifier and fail + # use existing identifier and fail se = SearchEngine(df, identifier_name="id") + wrong_id = new_base_item.copy() + wrong_id.iloc[0, 0] = "a" with pytest.raises(Exception): - se.add_identifier(identifier="j", data=new_base_item) + se.add_identifier(wrong_id) - # use existing identifier and fail + # add data without identifier column se = SearchEngine(df, identifier_name="id") - wrong_id = new_base_item.copy() - wrong_id["id"] = "a" + no_id = new_base_item.copy() + del no_id["id"] with pytest.raises(Exception): - se.add_identifier(identifier="a", data=wrong_id) + 
se.add_identifier(no_id) - # use column too many (should be removed) + # use column more (and find data in new col) se = SearchEngine(df, identifier_name="id") col_more = new_base_item.copy() - col_more["col3"] = "word" - se.add_identifier(identifier="i", data=col_more) - assert "col3" not in se.df.columns + col_more["col3"] = ["potatoes"] + se.add_identifier(col_more) + assert se.search("potatoes") == ["i"] # use column less (should be filled with empty string) se = SearchEngine(df, identifier_name="id") col_less = new_base_item.copy() del col_less["col2"] - se.add_identifier(identifier="i", data=col_less) + se.add_identifier(col_less) assert se.df.loc["i", "col2"] == "" # do search, add item and verify results are different se = SearchEngine(df, identifier_name="id") assert se.search("coal production") == ["a", "c", "b", "d", "h", "f", "g"] - se.add_identifier(identifier="i", data=new_base_item) + se.add_identifier(new_base_item) assert se.search("coal production") == ["i", "a", "c", "b", "d", "h", "f", "g"] @@ -198,23 +199,22 @@ def test_search_change_identifier(): df = data_for_test() # create base item to add - edit_data = { - "id": "a", - "col1": "cant find me anymore", - "col2": "something different" - } + edit_data = pd.DataFrame([ + ["a", "cant find me anymore", "something different"], + ], + columns=["id", "col1", "col2"]) # use non-existent identifier and fail se = SearchEngine(df, identifier_name="id") missing_id = edit_data.copy() - missing_id["id"] = "i" + missing_id["id"] = ["i"] with pytest.raises(Exception): se.change_identifier(identifier="i", data=missing_id) # use mismatched identifier and fail se = SearchEngine(df, identifier_name="id") wrong_id = edit_data.copy() - wrong_id["id"] = "i" + wrong_id["id"] = ["i"] with pytest.raises(Exception): se.change_identifier(identifier="a", data=wrong_id) @@ -224,9 +224,9 @@ def test_search_change_identifier(): se.change_identifier(identifier="a", data=edit_data) assert se.search("coal production") == ["c", "b", "d", "h", "f", "g"] # now change the same item partially and verify results are different - new_edit_data = { - "id": "a", - "col1": "coal" - } + new_edit_data = pd.DataFrame([ + ["a", "coal"], + ], + columns=["id", "col1"]) se.change_identifier(identifier="a", data=new_edit_data) assert se.search("coal production") == ["c", "b", "d", "h", "a", "f", "g"] From 2d6ca0f94659967808cafa0c062a881875b99ab2 Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Wed, 3 Sep 2025 11:38:33 +0200 Subject: [PATCH 11/47] update add/change identifier (and tests) to accept dataframes instead of dicts --- tests/test_search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_search.py b/tests/test_search.py index 620c5c3b8..52199a64e 100644 --- a/tests/test_search.py +++ b/tests/test_search.py @@ -44,7 +44,7 @@ def test_reverse_dict(): w2i = se.reverse_dict_many_to_one(se.identifier_to_word) assert w2i == se.word_to_identifier - # reverse same and verify is same as input + # reverse again and verify is same as original i2w = se.reverse_dict_many_to_one(w2i) assert i2w == se.identifier_to_word From 04053ab82de07d96d49f15a1407015d7b5a43ecd Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Wed, 3 Sep 2025 11:40:20 +0200 Subject: [PATCH 12/47] move searchengine.py to bwutils instead of subfolder --- activity_browser/bwutils/search/__init__.py | 1 - activity_browser/bwutils/{search => }/searchengine.py | 0 2 files changed, 1 deletion(-) delete mode 100644 activity_browser/bwutils/search/__init__.py rename 
activity_browser/bwutils/{search => }/searchengine.py (100%) diff --git a/activity_browser/bwutils/search/__init__.py b/activity_browser/bwutils/search/__init__.py deleted file mode 100644 index 85e30c9be..000000000 --- a/activity_browser/bwutils/search/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .searchengine import SearchEngine, MetaDataSearchEngine \ No newline at end of file diff --git a/activity_browser/bwutils/search/searchengine.py b/activity_browser/bwutils/searchengine.py similarity index 100% rename from activity_browser/bwutils/search/searchengine.py rename to activity_browser/bwutils/searchengine.py From 478ed5dfb0680abd7f498517aab512dfab9a06d1 Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Wed, 3 Sep 2025 11:41:08 +0200 Subject: [PATCH 13/47] move searchengine.py to bwutils instead of subfolder --- activity_browser/bwutils/__init__.py | 2 +- tests/test_search.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/activity_browser/bwutils/__init__.py b/activity_browser/bwutils/__init__.py index 5e97b3a2f..18839d5ac 100644 --- a/activity_browser/bwutils/__init__.py +++ b/activity_browser/bwutils/__init__.py @@ -13,7 +13,7 @@ from .montecarlo import MonteCarloLCA from .multilca import MLCA, Contributions from .pedigree import PedigreeMatrix -from .search import SearchEngine, MetaDataSearchEngine +from .searchengine import SearchEngine, MetaDataSearchEngine from .sensitivity_analysis import GlobalSensitivityAnalysis from .superstructure import SuperstructureContributions, SuperstructureMLCA from .uncertainty import (CFUncertaintyInterface, ExchangeUncertaintyInterface, diff --git a/tests/test_search.py b/tests/test_search.py index 52199a64e..2bb038124 100644 --- a/tests/test_search.py +++ b/tests/test_search.py @@ -1,6 +1,6 @@ import pytest import pandas as pd -from activity_browser.bwutils.search import SearchEngine +from activity_browser.bwutils import SearchEngine def data_for_test(): From 1c1300728d439d2808251a382929430b93d6c40e Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Wed, 3 Sep 2025 12:08:54 +0200 Subject: [PATCH 14/47] move searchengine files --- .../bwutils/searchengine/__init__.py | 2 + .../{searchengine.py => searchengine/base.py} | 185 ----------------- .../bwutils/searchengine/metadata_search.py | 196 ++++++++++++++++++ 3 files changed, 198 insertions(+), 185 deletions(-) create mode 100644 activity_browser/bwutils/searchengine/__init__.py rename activity_browser/bwutils/{searchengine.py => searchengine/base.py} (78%) create mode 100644 activity_browser/bwutils/searchengine/metadata_search.py diff --git a/activity_browser/bwutils/searchengine/__init__.py b/activity_browser/bwutils/searchengine/__init__.py new file mode 100644 index 000000000..7a7eae9c1 --- /dev/null +++ b/activity_browser/bwutils/searchengine/__init__.py @@ -0,0 +1,2 @@ +from base import SearchEngine +from metadata_search import MetaDataSearchEngine diff --git a/activity_browser/bwutils/searchengine.py b/activity_browser/bwutils/searchengine/base.py similarity index 78% rename from activity_browser/bwutils/searchengine.py rename to activity_browser/bwutils/searchengine/base.py index aaf090eb6..7f9d8158e 100644 --- a/activity_browser/bwutils/searchengine.py +++ b/activity_browser/bwutils/searchengine/base.py @@ -746,188 +746,3 @@ def search(self, text) -> list: log.debug( f"Found {len(identifiers)} ({len(literal_identifiers)} literal) search results for '{text}' in {len(self.df)} items in {time() - t:.2f} seconds") return identifiers - - -class MetaDataSearchEngine(SearchEngine): 
- def find_q_gram_matches(self, q_grams: set) -> pd.DataFrame: - """Overwritten for extra database specific reduction of results. - """ - n_q_grams = len(q_grams) - - matches = {} - - # find words that match our q-grams - for q_gram in q_grams: - if words := self.q_gram_to_word.get(q_gram, False): - # q_gram exists in our search index - for word in words: - if isinstance(self.database_ids, set): - # DATABASE SPECIFIC now filter on whether word is in the database - in_db = False - for _id in self.word_to_identifier[word]: - if _id in self.database_ids: - in_db = True - break - else: - in_db = True - if in_db: - matches[word] = matches.get(word, 0) + words[word] - - # if we find no results, return an empty dataframe - if len(matches) == 0: - return pd.DataFrame({"word": [], "matches": []}) - - # otherwise, create a dataframe and - # reduce search results to most relevant results - matches = {"word": matches.keys(), "matches": matches.values()} - matches = pd.DataFrame(matches) - max_q = max(matches["matches"]) # this has the most matching q-grams - - # determine how many results we want to keep based on how good our results are - min_q = max(max_q * 0.32, # have at least a third of q-grams of best match or... - max(n_q_grams * 0.5, # if more, at least half the q-grams in the query word? - 1)) # okay just do 1 q-gram if there are no more in the word - - matches = matches[matches["matches"] >= min_q] - matches = matches.sort_values(by="matches", ascending=False) - matches = matches.reset_index(drop=True) - - return matches.iloc[:min(len(matches), 2500), :] # return at most this many results - - def fuzzy_search(self, text: str) -> list: - """Overwritten for extra database specific reduction of results. - """ - queries = self.build_queries(text) - - # make list of unique original words - orig_words = OrderedDict() - for word in text.split(" "): - orig_words[word] = False - orig_words = orig_words.keys() - orig_words = {self.clean_text(word) for word in orig_words} - - # order the queries by the amount of words they contain - # we do this because longer queries (more words) are harder to find, but we have many alternatives so we search in a smaller search space - queries_by_size = OrderedDict() - longest_query = max([len(q) for q in queries]) - for query_len in range(1, longest_query + 1): - queries_by_size[query_len] = [q for q in queries if len(q) == query_len] - - # first handle queries of length 1 - query_to_identifier = self.search_size_1(queries_by_size[1], orig_words) - - # DATABASE SPECIFIC ensure all identifiers are in the database - if isinstance(self.database_ids, set): - new_q2i = {} - for word, _ids in query_to_identifier.items(): - keep = set.intersection(set(_ids.keys()), self.database_ids) - new_id_counter = Counter() - for _id in keep: - new_id_counter[_id] = _ids[_id] - if len(new_id_counter) > 0: - new_q2i[word] = new_id_counter - query_to_identifier = new_q2i - - # get all results into a df, we rank further later - all_identifiers = set() - for id_list in [id_list for id_list in query_to_identifier.values()]: - all_identifiers.update(id_list) - search_df = self.df.loc[list(all_identifiers)] - - # now, we search for combinations of query words and get only those identifiers - # we then reduce de search_df further for only those matching identifiers - # we then search the permutations of that set of words - for q_len, query_set in queries_by_size.items(): - if q_len == 1: - # we already did these above - continue - for query in query_set: - - # get the intersection of all 
identifiers - # meaning, a set of identifiers that occur in ALL sets of len(1) for the individual words in the query - # this ensures we only ever search data where ALL items occur to substantially reduce search-space - # finally, make this a Counter (with each item=1) so we can properly weigh things later - query_identifier_set = set.intersection(*[set(query_to_identifier.get(q_word)) for q_word in query if - query_to_identifier.get(q_word, False)]) - if len(query_identifier_set) == 0: - # there is no match for this combination of query words, skip - break - - # now we convert the query identifiers to a Counter of 'occurrence', - # where we weigh queries with only original words higher - query_identifiers = Counter() - for identifier in query_identifier_set: - weight = 0 - for query_word in query: - weight += query_to_identifier[query_word][identifier] - - query_identifiers[identifier] = weight - - # we now add these identifiers to a counter for this query name, - query_name = " ".join(query) - - weight = self.base_weight * q_len - query_to_identifier[query_name] = self.weigh_identifiers(query_identifiers, weight, Counter()) - - # now search for all permutations of this query combined with a space - query_df = search_df[search_df[self.identifier_name].isin(query_identifiers)] - for query_perm in permutations(query): - mask = self.filter_dataframe(query_df, " ".join(query_perm), search_columns=["query_col"]) - new_df = query_df.loc[mask].reset_index(drop=True) - if len(new_df) == 0: - # there is no match for this permutation of words, skip - continue - new_id_list = new_df[self.identifier_name] - - new_ids = Counter() - for new_id in new_id_list: - new_ids[new_id] = query_identifiers[new_id] - - # we weigh a combination of words that is next also to each other even higher than just the words separately - query_to_identifier[query_name] = self.weigh_identifiers(new_ids, weight, - query_to_identifier[query_name]) - # now finally, move to one object sorted list by highest score - all_identifiers = Counter() - for identifiers in query_to_identifier.values(): - all_identifiers += identifiers - - # now sort on highest weights and make list type - sorted_identifiers = [identifier[0] for identifier in all_identifiers.most_common()] - return sorted_identifiers - - def search(self, text, database: Optional[str] = None) -> list: - """Search the dataframe on this text, return a sorted list of identifiers.""" - t = time() - - if len(text) == 0: - log.debug(f"Empty search, returned all items") - return self.df.index.to_list() - - # get the set of ids that is in this database - if database is not None: - self.database_ids = set(self.df[self.df["database"] == database].index.to_list()) - else: - self.database_ids = None - - fuzzy_identifiers = self.fuzzy_search(text) - if len(fuzzy_identifiers) == 0: - log.debug(f"Found 0 search results for '{text}' in {len(self.df)} items in {time() - t:.2f} seconds") - return [] - - # take the fuzzy search sub-set of data and search it literally - df = self.df.loc[fuzzy_identifiers].copy() - - literal_identifiers = self.literal_search(text, df) - if len(literal_identifiers) == 0: - log.debug( - f"Found {len(fuzzy_identifiers)} search results for '{text}' in {len(self.df)} items in {time() - t:.2f} seconds") - return fuzzy_identifiers - - # append any fuzzy identifiers that were not found in the literal search - remaining_fuzzy_identifiers = [ - _id for _id in fuzzy_identifiers if _id not in set(literal_identifiers)] - identifiers = literal_identifiers + 
remaining_fuzzy_identifiers - - log.debug( - f"Found {len(identifiers)} ({len(literal_identifiers)} literal) search results for '{text}' in {len(self.df)} items in {time() - t:.2f} seconds") - return identifiers diff --git a/activity_browser/bwutils/searchengine/metadata_search.py b/activity_browser/bwutils/searchengine/metadata_search.py new file mode 100644 index 000000000..01a5f93aa --- /dev/null +++ b/activity_browser/bwutils/searchengine/metadata_search.py @@ -0,0 +1,196 @@ +from itertools import permutations +from collections import Counter, OrderedDict +from logging import getLogger +from time import time +from typing import Optional +import pandas as pd + +from activity_browser.bwutils.searchengine import SearchEngine + + +log = getLogger(__name__) + + +class MetaDataSearchEngine(SearchEngine): + def find_q_gram_matches(self, q_grams: set) -> pd.DataFrame: + """Overwritten for extra database specific reduction of results. + """ + n_q_grams = len(q_grams) + + matches = {} + + # find words that match our q-grams + for q_gram in q_grams: + if words := self.q_gram_to_word.get(q_gram, False): + # q_gram exists in our search index + for word in words: + if isinstance(self.database_ids, set): + # DATABASE SPECIFIC now filter on whether word is in the database + in_db = False + for _id in self.word_to_identifier[word]: + if _id in self.database_ids: + in_db = True + break + else: + in_db = True + if in_db: + matches[word] = matches.get(word, 0) + words[word] + + # if we find no results, return an empty dataframe + if len(matches) == 0: + return pd.DataFrame({"word": [], "matches": []}) + + # otherwise, create a dataframe and + # reduce search results to most relevant results + matches = {"word": matches.keys(), "matches": matches.values()} + matches = pd.DataFrame(matches) + max_q = max(matches["matches"]) # this has the most matching q-grams + + # determine how many results we want to keep based on how good our results are + min_q = max(max_q * 0.32, # have at least a third of q-grams of best match or... + max(n_q_grams * 0.5, # if more, at least half the q-grams in the query word? + 1)) # okay just do 1 q-gram if there are no more in the word + + matches = matches[matches["matches"] >= min_q] + matches = matches.sort_values(by="matches", ascending=False) + matches = matches.reset_index(drop=True) + + return matches.iloc[:min(len(matches), 2500), :] # return at most this many results + + def fuzzy_search(self, text: str) -> list: + """Overwritten for extra database specific reduction of results. 
+ """ + queries = self.build_queries(text) + + # make list of unique original words + orig_words = OrderedDict() + for word in text.split(" "): + orig_words[word] = False + orig_words = orig_words.keys() + orig_words = {self.clean_text(word) for word in orig_words} + + # order the queries by the amount of words they contain + # we do this because longer queries (more words) are harder to find, but we have many alternatives so we search in a smaller search space + queries_by_size = OrderedDict() + longest_query = max([len(q) for q in queries]) + for query_len in range(1, longest_query + 1): + queries_by_size[query_len] = [q for q in queries if len(q) == query_len] + + # first handle queries of length 1 + query_to_identifier = self.search_size_1(queries_by_size[1], orig_words) + + # DATABASE SPECIFIC ensure all identifiers are in the database + if isinstance(self.database_ids, set): + new_q2i = {} + for word, _ids in query_to_identifier.items(): + keep = set.intersection(set(_ids.keys()), self.database_ids) + new_id_counter = Counter() + for _id in keep: + new_id_counter[_id] = _ids[_id] + if len(new_id_counter) > 0: + new_q2i[word] = new_id_counter + query_to_identifier = new_q2i + + # get all results into a df, we rank further later + all_identifiers = set() + for id_list in [id_list for id_list in query_to_identifier.values()]: + all_identifiers.update(id_list) + search_df = self.df.loc[list(all_identifiers)] + + # now, we search for combinations of query words and get only those identifiers + # we then reduce de search_df further for only those matching identifiers + # we then search the permutations of that set of words + for q_len, query_set in queries_by_size.items(): + if q_len == 1: + # we already did these above + continue + for query in query_set: + + # get the intersection of all identifiers + # meaning, a set of identifiers that occur in ALL sets of len(1) for the individual words in the query + # this ensures we only ever search data where ALL items occur to substantially reduce search-space + # finally, make this a Counter (with each item=1) so we can properly weigh things later + query_identifier_set = set.intersection(*[set(query_to_identifier.get(q_word)) for q_word in query if + query_to_identifier.get(q_word, False)]) + if len(query_identifier_set) == 0: + # there is no match for this combination of query words, skip + break + + # now we convert the query identifiers to a Counter of 'occurrence', + # where we weigh queries with only original words higher + query_identifiers = Counter() + for identifier in query_identifier_set: + weight = 0 + for query_word in query: + weight += query_to_identifier[query_word][identifier] + + query_identifiers[identifier] = weight + + # we now add these identifiers to a counter for this query name, + query_name = " ".join(query) + + weight = self.base_weight * q_len + query_to_identifier[query_name] = self.weigh_identifiers(query_identifiers, weight, Counter()) + + # now search for all permutations of this query combined with a space + query_df = search_df[search_df[self.identifier_name].isin(query_identifiers)] + for query_perm in permutations(query): + mask = self.filter_dataframe(query_df, " ".join(query_perm), search_columns=["query_col"]) + new_df = query_df.loc[mask].reset_index(drop=True) + if len(new_df) == 0: + # there is no match for this permutation of words, skip + continue + new_id_list = new_df[self.identifier_name] + + new_ids = Counter() + for new_id in new_id_list: + new_ids[new_id] = query_identifiers[new_id] + + # we 
weigh a combination of words that is next also to each other even higher than just the words separately + query_to_identifier[query_name] = self.weigh_identifiers(new_ids, weight, + query_to_identifier[query_name]) + # now finally, move to one object sorted list by highest score + all_identifiers = Counter() + for identifiers in query_to_identifier.values(): + all_identifiers += identifiers + + # now sort on highest weights and make list type + sorted_identifiers = [identifier[0] for identifier in all_identifiers.most_common()] + return sorted_identifiers + + def search(self, text, database: Optional[str] = None) -> list: + """Search the dataframe on this text, return a sorted list of identifiers.""" + t = time() + + if len(text) == 0: + log.debug(f"Empty search, returned all items") + return self.df.index.to_list() + + # get the set of ids that is in this database + if database is not None: + self.database_ids = set(self.df[self.df["database"] == database].index.to_list()) + else: + self.database_ids = None + + fuzzy_identifiers = self.fuzzy_search(text) + if len(fuzzy_identifiers) == 0: + log.debug(f"Found 0 search results for '{text}' in {len(self.df)} items in {time() - t:.2f} seconds") + return [] + + # take the fuzzy search sub-set of data and search it literally + df = self.df.loc[fuzzy_identifiers].copy() + + literal_identifiers = self.literal_search(text, df) + if len(literal_identifiers) == 0: + log.debug( + f"Found {len(fuzzy_identifiers)} search results for '{text}' in {len(self.df)} items in {time() - t:.2f} seconds") + return fuzzy_identifiers + + # append any fuzzy identifiers that were not found in the literal search + remaining_fuzzy_identifiers = [ + _id for _id in fuzzy_identifiers if _id not in set(literal_identifiers)] + identifiers = literal_identifiers + remaining_fuzzy_identifiers + + log.debug( + f"Found {len(identifiers)} ({len(literal_identifiers)} literal) search results for '{text}' in {len(self.df)} items in {time() - t:.2f} seconds") + return identifiers From 646b3beeba41ab5b4a43a77fc55e58696847ba9e Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Wed, 3 Sep 2025 12:10:20 +0200 Subject: [PATCH 15/47] move searchengine files --- activity_browser/bwutils/searchengine/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/activity_browser/bwutils/searchengine/__init__.py b/activity_browser/bwutils/searchengine/__init__.py index 7a7eae9c1..a3ed1d8e1 100644 --- a/activity_browser/bwutils/searchengine/__init__.py +++ b/activity_browser/bwutils/searchengine/__init__.py @@ -1,2 +1,2 @@ -from base import SearchEngine -from metadata_search import MetaDataSearchEngine +from .base import SearchEngine +from .metadata_search import MetaDataSearchEngine From 39af7634a4ad43cd6aceec76f638989f7198dc4c Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Wed, 3 Sep 2025 13:27:26 +0200 Subject: [PATCH 16/47] metadata and search size logging --- activity_browser/bwutils/metadata.py | 15 +++++++++--- activity_browser/bwutils/searchengine/base.py | 23 +++++++++++++++---- 2 files changed, 31 insertions(+), 7 deletions(-) diff --git a/activity_browser/bwutils/metadata.py b/activity_browser/bwutils/metadata.py index 2e665cdd7..b3d6a7967 100644 --- a/activity_browser/bwutils/metadata.py +++ b/activity_browser/bwutils/metadata.py @@ -2,11 +2,13 @@ import itertools import sqlite3 import pickle +import sys from time import time from functools import lru_cache from typing import Set from logging import getLogger +from playhouse.shortcuts import model_to_dict import pandas 
as pd from qtpy.QtCore import Qt, QObject, Signal, SignalInstance @@ -15,7 +17,7 @@ from bw2data.errors import UnknownObject from bw2data.backends import sqlite3_lci_db, ActivityDataset -from activity_browser.bwutils.search import MetaDataSearchEngine +from activity_browser.bwutils.searchengine import MetaDataSearchEngine from activity_browser import signals @@ -183,6 +185,7 @@ def _get_database(self, db_name: str) -> pd.DataFrame | None: def sync(self) -> None: """Deletes metadata when the project is changed.""" + t = time() log.debug("Synchronizing MetaDataStore") con = sqlite3.connect(sqlite3_lci_db._filepath) @@ -190,8 +193,14 @@ def sync(self) -> None: con.close() self.dataframe = self._parse_df(node_df) - self.init_search() # init search index + size_bytes = sys.getsizeof(self.dataframe) + if size_bytes < 1024 ** 3: + size = f"{size_bytes / (1024 ** 2):.1f} MB" + else: + size = f"{size_bytes / (1024 ** 3):.2f} GB" + log.debug(f"MetaDataStore Synchronized in {time() - t:.2f} seconds for {len(self.dataframe)} items ({size}))") + self.init_search() # init search index self.synced.emit() def _parse_df(self, raw_df: pd.DataFrame) -> pd.DataFrame: @@ -351,7 +360,7 @@ def init_search(self): "product", "reference product", "classifications", "location", "properties" # activity specific ] - MetaDataSearchEngine(self.dataframe, identifier_name="id", searchable_columns=allowed_cols) + self.search_engine = MetaDataSearchEngine(self.dataframe, identifier_name="id", searchable_columns=allowed_cols) AB_metadata = MetaDataStore() diff --git a/activity_browser/bwutils/searchengine/base.py b/activity_browser/bwutils/searchengine/base.py index 7f9d8158e..293ddd230 100644 --- a/activity_browser/bwutils/searchengine/base.py +++ b/activity_browser/bwutils/searchengine/base.py @@ -8,6 +8,7 @@ import pandas as pd import numpy as np import re +import sys log = getLogger(__name__) @@ -128,9 +129,9 @@ def update_dict(update_me: dict, new: dict) -> dict: size_new = len(self.df) size_dif = size_new - size_old - size_msg = (f"{size_dif} changed items at {round(size_dif/(time() - t), 0)} items/sec " - f"({size_new} items currently)") if size_dif > 1 \ - else f"1 changed item ({size_new} items currently)" + size_msg = (f"{size_dif} changed items at {int(round(size_dif/(time() - t), 0))} items/sec " + f"({size_new} items ({self.size_of_index()}) currently)") if size_dif > 1 \ + else f"1 changed item ({size_new} items ({self.size_of_index()}) currently)" log.debug(f"Search index updated in {time() - t:.2f} seconds for {size_msg}.") def clean_text(self, text: str): @@ -214,6 +215,20 @@ def word_in_index(self, word: str) -> bool: f"Given word '{word}' must not contain spaces.") return word in self.word_to_identifier.keys() + def size_of_index(self): + """return the size of the search index in MB or GB.""" + s_df = sys.getsizeof(self.df) + s_i2w = sys.getsizeof(self.identifier_to_word) + s_w2i = sys.getsizeof(self.word_to_identifier) + s_w2q = sys.getsizeof(self.word_to_q_grams) + s_q2w = sys.getsizeof(self.q_gram_to_word) + size_bytes = s_df + s_i2w + s_w2i + s_w2q + s_q2w + + if size_bytes < 1024 ** 3: + return f"{size_bytes / (1024 ** 2):.1f} MB" + else: + return f"{size_bytes / (1024 ** 3):.2f} GB" + # +++ Changes to searchable data def add_identifier(self, data: pd.DataFrame) -> None: @@ -296,7 +311,7 @@ def remove_identifier(self, identifier, logging=True) -> None: if logging: log.debug(f"Search index updated in {time() - t:.2f} seconds " - f"for 1 removed item ({len(self.df)} items currently).") + f"for 1 removed 
item ({len(self.df)} items ({self.size_of_index()}) currently).") def change_identifier(self, identifier, data: pd.DataFrame) -> None: """Change this identifier. From fad8a06f6d5c3af2668fd7e2a35cf408c98710bf Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Wed, 3 Sep 2025 16:27:11 +0200 Subject: [PATCH 17/47] - Faster results with large data and short queries - solve bracket in wrong place breaking matchfinding --- activity_browser/bwutils/searchengine/base.py | 10 ++++++---- .../bwutils/searchengine/metadata_search.py | 3 ++- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/activity_browser/bwutils/searchengine/base.py b/activity_browser/bwutils/searchengine/base.py index 293ddd230..618585991 100644 --- a/activity_browser/bwutils/searchengine/base.py +++ b/activity_browser/bwutils/searchengine/base.py @@ -257,7 +257,8 @@ def add_identifier(self, data: pd.DataFrame) -> None: if col not in data.columns: data[col] = [""] * len(data) # re-order cols, first existing, then new - new_cols = [col for col in data.columns if col not in self.columns if col not in set(df_cols)] + df_col_set = set(df_cols) + new_cols = [col for col in data.columns if col not in self.columns if col not in df_col_set] data_cols = df_cols + new_cols data = data[data_cols] # re-order new data to be in correct order @@ -285,7 +286,7 @@ def remove_identifier(self, identifier, logging=True) -> None: f"Identifier '{identifier}' does not exist in the search data, cannot remove identifier that do not exist.") # remove from df - self.df.drop(identifier, inplace=True) + self.df = self.df.drop(identifier) # find words that may need to be removed words = self.identifier_to_word[identifier] @@ -547,7 +548,7 @@ def spell_check(self, text: str) -> OrderedDict: for match, num in new.most_common(): if num == prev_num: matches.append(match) - elif num != prev_num and len(matches <= matches_max): + elif num != prev_num and len(matches) <= matches_max: matches.append(match) else: break @@ -754,8 +755,9 @@ def search(self, text) -> list: return fuzzy_identifiers # append any fuzzy identifiers that were not found in the literal search + literal_id_set = set(literal_identifiers) remaining_fuzzy_identifiers = [ - _id for _id in fuzzy_identifiers if _id not in set(literal_identifiers)] + _id for _id in fuzzy_identifiers if _id not in literal_id_set] identifiers = literal_identifiers + remaining_fuzzy_identifiers log.debug( diff --git a/activity_browser/bwutils/searchengine/metadata_search.py b/activity_browser/bwutils/searchengine/metadata_search.py index 01a5f93aa..43332e939 100644 --- a/activity_browser/bwutils/searchengine/metadata_search.py +++ b/activity_browser/bwutils/searchengine/metadata_search.py @@ -187,8 +187,9 @@ def search(self, text, database: Optional[str] = None) -> list: return fuzzy_identifiers # append any fuzzy identifiers that were not found in the literal search + literal_id_set = set(literal_identifiers) remaining_fuzzy_identifiers = [ - _id for _id in fuzzy_identifiers if _id not in set(literal_identifiers)] + _id for _id in fuzzy_identifiers if _id not in literal_id_set] identifiers = literal_identifiers + remaining_fuzzy_identifiers log.debug( From 1aad95b421d15c8ec8e9bd406a5cea47c802689c Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Wed, 3 Sep 2025 18:12:53 +0200 Subject: [PATCH 18/47] Base implementation of better search in ActivitiesProducts table --- activity_browser/bwutils/metadata.py | 7 ++- activity_browser/bwutils/searchengine/base.py | 4 +- .../bwutils/searchengine/metadata_search.py | 20 ++++++- 
.../layouts/panes/database_products.py | 56 ++++++++++++++++++- activity_browser/ui/widgets/item_model.py | 19 +++++-- activity_browser/ui/widgets/treeview.py | 5 +- 6 files changed, 100 insertions(+), 11 deletions(-) diff --git a/activity_browser/bwutils/metadata.py b/activity_browser/bwutils/metadata.py index b3d6a7967..7e1ff5e1f 100644 --- a/activity_browser/bwutils/metadata.py +++ b/activity_browser/bwutils/metadata.py @@ -5,7 +5,7 @@ import sys from time import time from functools import lru_cache -from typing import Set +from typing import Set, Optional from logging import getLogger from playhouse.shortcuts import model_to_dict @@ -362,5 +362,10 @@ def init_search(self): self.search_engine = MetaDataSearchEngine(self.dataframe, identifier_name="id", searchable_columns=allowed_cols) + def db_search(self, query:str, database: Optional[str] = None, return_counter: bool = False): + return self.search_engine.fuzzy_search(query, database=database, return_counter=return_counter) + + def search(self, query:str): + return self.search_engine.search(query) AB_metadata = MetaDataStore() diff --git a/activity_browser/bwutils/searchengine/base.py b/activity_browser/bwutils/searchengine/base.py index 618585991..f01b941a7 100644 --- a/activity_browser/bwutils/searchengine/base.py +++ b/activity_browser/bwutils/searchengine/base.py @@ -628,7 +628,7 @@ def search_size_1(self, queries: list, original_words: set, orig_word_weight=5, return matched_identifiers - def fuzzy_search(self, text: str) -> list: + def fuzzy_search(self, text: str, return_counter: bool = False) -> list: """Search the dataframe, finding approximate matches and return a list of identifiers, ranked by how well each identifier matches the search text. @@ -728,6 +728,8 @@ def fuzzy_search(self, text: str) -> list: for identifiers in query_to_identifier.values(): all_identifiers += identifiers + if return_counter: + return all_identifiers # now sort on highest weights and make list type sorted_identifiers = [identifier[0] for identifier in all_identifiers.most_common()] return sorted_identifiers diff --git a/activity_browser/bwutils/searchengine/metadata_search.py b/activity_browser/bwutils/searchengine/metadata_search.py index 43332e939..624e2365b 100644 --- a/activity_browser/bwutils/searchengine/metadata_search.py +++ b/activity_browser/bwutils/searchengine/metadata_search.py @@ -57,9 +57,20 @@ def find_q_gram_matches(self, q_grams: set) -> pd.DataFrame: return matches.iloc[:min(len(matches), 2500), :] # return at most this many results - def fuzzy_search(self, text: str) -> list: + def fuzzy_search(self, text: str, database: Optional[str] = None, return_counter: bool = False, logging: bool = True) -> list: """Overwritten for extra database specific reduction of results. 
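# --- Editor's note: the diff above routes fuzzy searches through the q-gram
# --- pre-filtering of find_q_gram_matches before the more expensive
# --- edit-distance step. A minimal, self-contained sketch of that idea (not
# --- part of the patch; `index_words` and `candidate_words` are hypothetical
# --- names, the real code works on the q_gram_to_word dict and a pandas
# --- DataFrame):
from collections import Counter

def q_grams(word: str, q: int = 2) -> set:
    # "cola" -> {"co", "ol", "la"} (the real class also records positions)
    return {word[i:i + q] for i in range(len(word) - q + 1)}

def candidate_words(query_word: str, index_words: list) -> list:
    grams = q_grams(query_word)
    overlap = Counter({w: len(grams & q_grams(w)) for w in index_words})
    overlap = Counter({w: n for w, n in overlap.items() if n > 0})
    if not overlap:
        return []
    max_q = max(overlap.values())
    # same shape as the min_q threshold tuned later in this series: roughly a
    # third of the best match, at least half the query's own q-grams, never
    # more than max_q
    min_q = min(max(max_q * 0.32, max(len(grams) * 0.5, 1)), max_q)
    return [w for w, n in overlap.most_common() if n >= min_q]

# candidate_words("cola", ["coal", "coat", "market"]) keeps "coal" and "coat"
# as fuzzy candidates for the typo "cola" and drops "market" entirely.
# --- end of editor's note ---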
""" + if len(text) == 0: + log.debug(f"Empty search, returned all items") + return self.df.index.to_list() + t = time() + + # DATABASE SPECIFIC get the set of ids that is in this database + if database is not None: + self.database_ids = set(self.df[self.df["database"] == database].index.to_list()) + else: + self.database_ids = None + queries = self.build_queries(text) # make list of unique original words @@ -154,6 +165,11 @@ def fuzzy_search(self, text: str) -> list: for identifiers in query_to_identifier.values(): all_identifiers += identifiers + if logging: + log.debug( + f"Found {len(all_identifiers)} search results for '{text}' in {len(self.df)} items in {time() - t:.2f} seconds") + if return_counter: + return all_identifiers # now sort on highest weights and make list type sorted_identifiers = [identifier[0] for identifier in all_identifiers.most_common()] return sorted_identifiers @@ -172,7 +188,7 @@ def search(self, text, database: Optional[str] = None) -> list: else: self.database_ids = None - fuzzy_identifiers = self.fuzzy_search(text) + fuzzy_identifiers = self.fuzzy_search(text, database=database, logging=False) if len(fuzzy_identifiers) == 0: log.debug(f"Found 0 search results for '{text}' in {len(self.df)} items in {time() - t:.2f} seconds") return [] diff --git a/activity_browser/layouts/panes/database_products.py b/activity_browser/layouts/panes/database_products.py index 4a49851c5..4439e140a 100644 --- a/activity_browser/layouts/panes/database_products.py +++ b/activity_browser/layouts/panes/database_products.py @@ -1,5 +1,6 @@ from logging import getLogger from time import time +from collections import Counter import pandas as pd from qtpy import QtWidgets, QtCore, QtGui @@ -56,6 +57,8 @@ def __init__(self, parent, db_name: str): self.table_view = ProductView(self) self.table_view.setModel(self.model) self.model.setDataFrame(self.build_df()) + self.model.has_external_search = True + self.model.external_col_name = db_name self.search = widgets.ABLineEdit(self) self.search.setMaximumHeight(30) @@ -81,7 +84,11 @@ def connect_signals(self): signals.database.deleted.connect(self.on_database_deleted) self.table_view.filtered.connect(self.search_error) - self.search.textChangedDebounce.connect(self.table_view.setAllFilter) + self.search.textChangedDebounce.connect(self.set_queries) + + def set_queries(self, query: str) -> None: + self.model.set_external_query(query) + self.table_view.setAllFilter(query) def saveState(self): """ @@ -360,6 +367,27 @@ def selected_activities(self) -> [tuple]: items = [i.internalPointer() for i in self.selectedIndexes() if isinstance(i.internalPointer(), ProductItem)] return list({item["activity_key"] for item in items if item["activity_key"] is not None}) + def buildQuery(self) -> str: + queries = ["(index == index)"] + + # query for the column filters + for col in list(self.columnFilters): + if col not in self.model().columns(): + del self.columnFilters[col] + + for col, query in self.columnFilters.items(): + q = f"({col}.astype('str').str.contains('{self.format_query(query)}'))" + queries.append(q) + + # query for the all filter + if self.allFilter.startswith('='): + queries.append(f"({self.allFilter[1:]})") + + query = " & ".join(queries) + log.debug(f"{self.__class__.__name__} built query: {query}") + + return query + class ProductItem(ui.widgets.ABDataItem): """ @@ -454,3 +482,29 @@ def values_from_indices(key: str, indices: list[QtCore.QModelIndex]): continue values.append(item[key]) return values + + def external_search(self, query): + 
results = AB_metadata.db_search(query, database=self.external_col_name, return_counter=True) + + # extract a dict with 'key' as key and 'id' as values from the metadata + result_ids = set(results.keys()) + # extract df with only result IDs and columns 'id' and 'key' + df = AB_metadata.dataframe[AB_metadata.dataframe["id"].isin(result_ids)].loc[:, ["id", "key"]] + df = df.set_index("key", drop=True) + translate_dict = df.to_dict()["id"] + + # convert the metadata id scores to row id scores + row_scores = Counter() + df = self.dataframe.copy() + act_idx = set(df[df["activity_key"].isin(translate_dict.keys())].index.to_list()) + prd_idx = set(df[df["product_key"].isin(translate_dict.keys())].index.to_list()) + indices = act_idx | prd_idx # combine the two sets ('|' is a set union) + # iterate over the indices + for index in indices: + act_score = results.get(translate_dict.get(df.loc[index, "activity_key"]), 0) + prd_score = results.get(translate_dict.get(df.loc[index, "product_key"]), 0) + row_scores[index] = act_score + prd_score + + # finally only return the indices + sorted_indices = [identifier[0] for identifier in row_scores.most_common()] + return sorted_indices diff --git a/activity_browser/ui/widgets/item_model.py b/activity_browser/ui/widgets/item_model.py index 62c7b040a..696a6f9dc 100644 --- a/activity_browser/ui/widgets/item_model.py +++ b/activity_browser/ui/widgets/item_model.py @@ -26,6 +26,9 @@ def __init__(self, parent=None, dataframe=None): self.sort_column: int = 0 # column that is currently sorted self.sort_order: Qt.SortOrder = Qt.SortOrder.AscendingOrder self._query = "" # Pandas query currently applied to the dataframe + self.has_external_search = False + self._external_query = "" + self.external_col_name = "" self.setDataFrame(self.dataframe) @@ -192,7 +195,11 @@ def endResetModel(self): # apply any queries to the dataframe if q := self.query(): - df = self.dataframe.query(q).reset_index(drop=True).copy() + df = self.dataframe.copy() + if self.has_external_search and self._external_query != "": + indices = self.external_search(self._external_query) + df = df.loc[indices] + df = df.query(q).reset_index(drop=True) else: df = self.dataframe.copy() @@ -271,11 +278,15 @@ def setQuery(self, query: str): self._query = query self.endResetModel() + def set_external_query(self, query: str): + if not query.startswith("="): + self._external_query = query + + def external_search(self, query): + NotImplementedError + def hasChildren(self, parent: QtCore.QModelIndex): item = parent.internalPointer() if isinstance(item, ABAbstractItem): return item.has_children() return super().hasChildren(parent) - - - diff --git a/activity_browser/ui/widgets/treeview.py b/activity_browser/ui/widgets/treeview.py index 89cb49aa0..36222b1bb 100644 --- a/activity_browser/ui/widgets/treeview.py +++ b/activity_browser/ui/widgets/treeview.py @@ -6,6 +6,7 @@ from qtpy.QtCore import Qt from .item_model import ABItemModel +from activity_browser.ui import widgets log = getLogger(__name__) @@ -25,11 +26,11 @@ def __init__(self, pos: QtCore.QPoint, view: "ABTreeView"): col_index = view.columnAt(pos.x()) col_name = model.columns()[col_index] - search_box = QtWidgets.QLineEdit(self) + search_box = widgets.ABLineEdit(self) search_box.setText(view.columnFilters.get(col_name, "")) search_box.setPlaceholderText("Search") search_box.selectAll() - search_box.textChanged.connect(lambda query: view.setColumnFilter(col_name, query)) + search_box.textChangedDebounce.connect(lambda query: view.setColumnFilter(col_name, 
query)) widget_action = QtWidgets.QWidgetAction(self) widget_action.setDefaultWidget(search_box) self.addAction(widget_action) From 5b0a965670edc6951efd4a464f02c0aa199073bf Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Thu, 4 Sep 2025 08:21:48 +0200 Subject: [PATCH 19/47] check all newly added items are unique --- activity_browser/bwutils/searchengine/base.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/activity_browser/bwutils/searchengine/base.py b/activity_browser/bwutils/searchengine/base.py index f01b941a7..d975726c2 100644 --- a/activity_browser/bwutils/searchengine/base.py +++ b/activity_browser/bwutils/searchengine/base.py @@ -244,13 +244,18 @@ def add_identifier(self, data: pd.DataFrame) -> None: raise Exception( f"Identifier column '{self.identifier_name}' not in new data, impossible to add data without identifier") - # make sure we the identifier does not yet exist + # make sure we the new identifiers do not yet exist existing_ids = set(self.df.index.to_list()) for identifier in data[self.identifier_name]: if identifier in existing_ids: raise Exception( f"Identifier '{identifier}' is already in use, use a different identifier or use the change_identifier function.") + # make sure all new identifiers given are unique + if data[self.identifier_name].nunique() != data.shape[0]: + raise KeyError( + f"Identifier column {self.identifier_name} must only contain unique values. Found {data[self.identifier_name].nunique()} unique values for length {data.shape[0]}") + df_cols = self.columns # add cols to new data that are missing for col in df_cols: From 9ee34503703a83d2d50bd6e5024b74009a1c7c09 Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Thu, 4 Sep 2025 16:17:49 +0200 Subject: [PATCH 20/47] dont allow sorting of table when search engine in use --- activity_browser/ui/widgets/item_model.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/activity_browser/ui/widgets/item_model.py b/activity_browser/ui/widgets/item_model.py index 09c9d7fc3..3a2fd6d74 100644 --- a/activity_browser/ui/widgets/item_model.py +++ b/activity_browser/ui/widgets/item_model.py @@ -203,13 +203,14 @@ def endResetModel(self): else: df = self.dataframe.copy() - if not self.sort_column > len(self.columns()) - 1: - # apply the sorting - df.sort_values( - by=self.columns()[self.sort_column], - ascending=(self.sort_order == Qt.SortOrder.AscendingOrder), - inplace=True, ignore_index=True - ) + if not (self.has_external_search and self._external_query != ""): + if not self.sort_column > len(self.columns()) - 1: + # apply the sorting + df.sort_values( + by=self.columns()[self.sort_column], + ascending=(self.sort_order == Qt.SortOrder.AscendingOrder), + inplace=True, ignore_index=True + ) # rebuild the ABItem tree self.root = self.branchItemClass("root") @@ -281,6 +282,8 @@ def setQuery(self, query: str): def set_external_query(self, query: str): if not query.startswith("="): self._external_query = query + else: + self._external_query = "" def external_search(self, query): NotImplementedError From e92d298dce86a11bad237cae3821a99bdf3beac7 Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Thu, 4 Sep 2025 16:19:32 +0200 Subject: [PATCH 21/47] resolve search bug with multiple typos not working --- activity_browser/bwutils/searchengine/base.py | 25 ++++++++++++------- .../bwutils/searchengine/metadata_search.py | 24 ++++++++++++------ tests/test_search.py | 6 +++++ 3 files changed, 38 insertions(+), 17 deletions(-) diff --git a/activity_browser/bwutils/searchengine/base.py 
b/activity_browser/bwutils/searchengine/base.py index d975726c2..75b81424f 100644 --- a/activity_browser/bwutils/searchengine/base.py +++ b/activity_browser/bwutils/searchengine/base.py @@ -43,8 +43,8 @@ def __init__(self, df: pd.DataFrame, identifier_name: str, searchable_columns: l log.debug(f"SearchEngine initializing for {len(df)} items") # compile regex patterns for cleaning - self.SUB_PATTERN = re.compile(r"[,\(\)\[\]'\"]") # for replacing with empty string - self.SPACE_PATTERN = re.compile(r"[-−:;]") # for replacing with space + self.SUB_PATTERN = re.compile(r"[,\(\)\[\]'\"…]") # for replacing with empty string + self.SPACE_PATTERN = re.compile(r"[-−:;/+]") # for replacing with space self.ONE_SPACE_PATTERN = re.compile(r"\s+") # for replacing multiple white space with 1 space self.q = 2 # character length of q grams @@ -471,9 +471,10 @@ def find_q_gram_matches(self, q_grams: set) -> pd.DataFrame: max_q = max(matches["matches"]) # this has the most matching q-grams # determine how many results we want to keep based on how good our results are - min_q = max(max_q * 0.32, # have at least a third of q-grams of best match or... - max(n_q_grams * 0.5, # if more, at least half the q-grams in the query word? - 1)) # okay just do 1 q-gram if there are no more in the word + min_q = min(max(max_q * 0.32, # have at least a third of q-grams of best match or... + max(n_q_grams * 0.5, # if more, at least half the q-grams in the query word? + 1)), # okay just do 1 q-gram if there are no more in the word + max_q) # never have min_q be over max_q matches = matches[matches["matches"] >= min_q] matches = matches.sort_values(by="matches", ascending=False) @@ -650,6 +651,7 @@ def fuzzy_search(self, text: str, return_counter: bool = False) -> list: Finally, all found identifiers are sorted on their weight and returned. 
""" + text = text.strip() queries = self.build_queries(text) @@ -684,13 +686,16 @@ def fuzzy_search(self, text: str, return_counter: bool = False) -> list: # we already did these above continue for query in query_set: - # get the intersection of all identifiers # meaning, a set of identifiers that occur in ALL sets of len(1) for the individual words in the query # this ensures we only ever search data where ALL items occur to substantially reduce search-space # finally, make this a Counter (with each item=1) so we can properly weigh things later - query_identifier_set = set.intersection(*[set(query_to_identifier.get(q_word)) for q_word in query if - query_to_identifier.get(q_word, False)]) + query_id_sets = [set(query_to_identifier.get(q_word)) for q_word in query if + query_to_identifier.get(q_word, False)] + if len(query_id_sets) > 0: + query_identifier_set = set.intersection(*query_id_sets) + else: + query_identifier_set = set() if len(query_identifier_set) == 0: # there is no match for this combination of query words, skip break @@ -701,7 +706,8 @@ def fuzzy_search(self, text: str, return_counter: bool = False) -> list: for identifier in query_identifier_set: weight = 0 for query_word in query: - weight += query_to_identifier[query_word][identifier] + # if the query_word and identifier combination exist get score, otherwise 0 + weight += query_to_identifier.get(query_word, {}).get(identifier, 0) query_identifiers[identifier] = weight @@ -742,6 +748,7 @@ def fuzzy_search(self, text: str, return_counter: bool = False) -> list: def search(self, text) -> list: """Search the dataframe on this text, return a sorted list of identifiers.""" t = time() + text = text.strip() if len(text) == 0: log.debug(f"Empty search, returned all items") diff --git a/activity_browser/bwutils/searchengine/metadata_search.py b/activity_browser/bwutils/searchengine/metadata_search.py index 624e2365b..c09c28aa8 100644 --- a/activity_browser/bwutils/searchengine/metadata_search.py +++ b/activity_browser/bwutils/searchengine/metadata_search.py @@ -47,9 +47,10 @@ def find_q_gram_matches(self, q_grams: set) -> pd.DataFrame: max_q = max(matches["matches"]) # this has the most matching q-grams # determine how many results we want to keep based on how good our results are - min_q = max(max_q * 0.32, # have at least a third of q-grams of best match or... - max(n_q_grams * 0.5, # if more, at least half the q-grams in the query word? - 1)) # okay just do 1 q-gram if there are no more in the word + min_q = min(max(max_q * 0.32, # have at least a third of q-grams of best match or... + max(n_q_grams * 0.5, # if more, at least half the q-grams in the query word? + 1)), # okay just do 1 q-gram if there are no more in the word + max_q) # never have min_q be over max_q matches = matches[matches["matches"] >= min_q] matches = matches.sort_values(by="matches", ascending=False) @@ -60,10 +61,12 @@ def find_q_gram_matches(self, q_grams: set) -> pd.DataFrame: def fuzzy_search(self, text: str, database: Optional[str] = None, return_counter: bool = False, logging: bool = True) -> list: """Overwritten for extra database specific reduction of results. 
""" + t = time() + text = text.strip() + if len(text) == 0: log.debug(f"Empty search, returned all items") return self.df.index.to_list() - t = time() # DATABASE SPECIFIC get the set of ids that is in this database if database is not None: @@ -116,13 +119,16 @@ def fuzzy_search(self, text: str, database: Optional[str] = None, return_counter # we already did these above continue for query in query_set: - # get the intersection of all identifiers # meaning, a set of identifiers that occur in ALL sets of len(1) for the individual words in the query # this ensures we only ever search data where ALL items occur to substantially reduce search-space # finally, make this a Counter (with each item=1) so we can properly weigh things later - query_identifier_set = set.intersection(*[set(query_to_identifier.get(q_word)) for q_word in query if - query_to_identifier.get(q_word, False)]) + query_id_sets = [set(query_to_identifier.get(q_word)) for q_word in query if + query_to_identifier.get(q_word, False)] + if len(query_id_sets) > 0: + query_identifier_set = set.intersection(*query_id_sets) + else: + query_identifier_set = set() if len(query_identifier_set) == 0: # there is no match for this combination of query words, skip break @@ -133,7 +139,8 @@ def fuzzy_search(self, text: str, database: Optional[str] = None, return_counter for identifier in query_identifier_set: weight = 0 for query_word in query: - weight += query_to_identifier[query_word][identifier] + # if the query_word and identifier combination exist get score, otherwise 0 + weight += query_to_identifier.get(query_word, {}).get(identifier, 0) query_identifiers[identifier] = weight @@ -177,6 +184,7 @@ def fuzzy_search(self, text: str, database: Optional[str] = None, return_counter def search(self, text, database: Optional[str] = None) -> list: """Search the dataframe on this text, return a sorted list of identifiers.""" t = time() + text = text.strip() if len(text) == 0: log.debug(f"Empty search, returned all items") diff --git a/tests/test_search.py b/tests/test_search.py index 2bb038124..6d63c14ee 100644 --- a/tests/test_search.py +++ b/tests/test_search.py @@ -127,6 +127,12 @@ def test_search_base(): assert se.search("coal") == ["a", "h", "c", "b", "d", "g", "f"] # do search on other term assert se.search("coal production") == ["a", "c", "b", "d", "h", "f", "g"] + # do search on typo + assert se.search("cola") == ["a", "c", "h", "b", "d", "f", "g"] + # do search on longer typo + assert se.search("cola production") == ["c", "a", "b", "d", "h", "f", "g"] + # do search on something we will definitely not find + assert se.search("dontFindThis") == [] # init search class with 1 col searchable se = SearchEngine(df, identifier_name="id", searchable_columns=["col2"]) From 83ae1621f6329679c190ff7b97beab0eb1600008 Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Thu, 4 Sep 2025 18:01:59 +0200 Subject: [PATCH 22/47] First version of autocomplete --- activity_browser/bwutils/metadata.py | 5 ++ .../bwutils/searchengine/metadata_search.py | 72 +++++++++++++++++++ .../layouts/panes/database_products.py | 3 +- activity_browser/ui/widgets/__init__.py | 2 +- activity_browser/ui/widgets/line_edit.py | 36 +++++++++- 5 files changed, 115 insertions(+), 3 deletions(-) diff --git a/activity_browser/bwutils/metadata.py b/activity_browser/bwutils/metadata.py index 7e1ff5e1f..70ab0606f 100644 --- a/activity_browser/bwutils/metadata.py +++ b/activity_browser/bwutils/metadata.py @@ -368,4 +368,9 @@ def db_search(self, query:str, database: Optional[str] = None, 
return_counter: b def search(self, query:str): return self.search_engine.search(query) + def auto_complete(self, word:str, database: Optional[str] = None): + word = self.search_engine.clean_text(word) + completions = self.search_engine.auto_complete(word, database) + return completions + AB_metadata = MetaDataStore() diff --git a/activity_browser/bwutils/searchengine/metadata_search.py b/activity_browser/bwutils/searchengine/metadata_search.py index c09c28aa8..ef6162723 100644 --- a/activity_browser/bwutils/searchengine/metadata_search.py +++ b/activity_browser/bwutils/searchengine/metadata_search.py @@ -12,6 +12,78 @@ class MetaDataSearchEngine(SearchEngine): + + def auto_complete(self, text: str, database) -> OrderedDict: + """Based on spellchecker, make more useful for autocompletions + """ + if database is not None: + self.database_ids = set(self.df[self.df["database"] == database].index.to_list()) + else: + self.database_ids = None + + count_occurence = lambda x: sum(self.word_to_identifier[x].values()) # count occurences of a word + + word_results = OrderedDict() + + matches_min = 3 # ideally we have at least this many alternatives + matches_max = 10 # ideally don't much more than this many matches + always_accept_this = 1 # values of this edit distance or lower always accepted + never_accept_this = 4 # values this edit distance or over always rejected + + # make list of unique words + words = OrderedDict() + for word in text.split(" "): + words[word] = False + words = words.keys() + + words = [self.clean_text(word) for word in words] + + for word in words: + # first, find possible matches quickly + q_grams = self.text_to_positional_q_gram(word) + possible_matches = self.find_q_gram_matches(set(q_grams)) + + matches = [] + first_matches = Counter() + other_matches = {} + + # now, refine with edit distance + for row in possible_matches.itertuples(): + + edit_distance = self.osa_distance(word, row[1], cutoff=never_accept_this) + + if edit_distance == 0: + continue # we are looking for alternatives only, not the exact word + elif edit_distance <= always_accept_this: + first_matches[row[1]] = count_occurence(row[1]) + elif edit_distance < never_accept_this: + if not other_matches.get(edit_distance): + other_matches[edit_distance] = Counter() + other_matches[edit_distance][row[1]] = count_occurence(row[1]) + else: + continue + + # add matches in correct order: + for match, _ in first_matches.most_common(): + matches.append(match) + # if we have fewer matches than goal, add more 'less good' matches + if len(matches) < matches_min: + for i in range(always_accept_this + 1, never_accept_this): + # iteratively increase matches with 'worse' results so we hit goal of minimum alternatives + if new := other_matches.get(i): + prev_num = 10e100 + for match, num in new.most_common(): + if num == prev_num: + matches.append(match) + elif num != prev_num and len(matches) <= matches_max: + matches.append(match) + else: + break + prev_num = num + + word_results[word] = matches + return word_results + def find_q_gram_matches(self, q_grams: set) -> pd.DataFrame: """Overwritten for extra database specific reduction of results. 
""" diff --git a/activity_browser/layouts/panes/database_products.py b/activity_browser/layouts/panes/database_products.py index 4439e140a..48de92aa1 100644 --- a/activity_browser/layouts/panes/database_products.py +++ b/activity_browser/layouts/panes/database_products.py @@ -60,7 +60,8 @@ def __init__(self, parent, db_name: str): self.model.has_external_search = True self.model.external_col_name = db_name - self.search = widgets.ABLineEdit(self) + self.search = widgets.MetaDataAutoCompleteLineEdit(self) + self.search.database_name = db_name self.search.setMaximumHeight(30) self.search.setPlaceholderText("Quick Search") diff --git a/activity_browser/ui/widgets/__init__.py b/activity_browser/ui/widgets/__init__.py index f8c0c439b..333811439 100644 --- a/activity_browser/ui/widgets/__init__.py +++ b/activity_browser/ui/widgets/__init__.py @@ -2,7 +2,7 @@ from .comparison_switch import SwitchComboBox from .cutoff_menu import CutoffMenu from .line_edit import (ABLineEdit, SignalledComboEdit, SignalledLineEdit, - SignalledPlainTextEdit) + SignalledPlainTextEdit, MetaDataAutoCompleteLineEdit) from .treeview import ABTreeView from .item_model import ABItemModel from .item import ABAbstractItem, ABBranchItem, ABDataItem diff --git a/activity_browser/ui/widgets/line_edit.py b/activity_browser/ui/widgets/line_edit.py index 655d269d5..d78c2557b 100644 --- a/activity_browser/ui/widgets/line_edit.py +++ b/activity_browser/ui/widgets/line_edit.py @@ -1,8 +1,10 @@ from qtpy import QtWidgets -from qtpy.QtCore import QTimer, Slot, Signal, SignalInstance +from qtpy.QtCore import QTimer, Slot, Signal, SignalInstance, QStringListModel, Qt from qtpy.QtGui import QTextFormat from qtpy.QtWidgets import QCompleter +from activity_browser.bwutils import AB_metadata + class ABLineEdit(QtWidgets.QLineEdit): textChangedDebounce: SignalInstance = Signal(str) @@ -120,3 +122,35 @@ def __init__(self, items: list[str], parent=None): super().__init__(parent=parent) completer = QCompleter(items, self) self.setCompleter(completer) + +class MetaDataAutoCompleteLineEdit(ABLineEdit): + """Line Edit with MetaDataStore completer attached""" + def __init__(self, parent=None): + super().__init__(parent=parent) + self.database_name = "" + + self.textChanged.connect(self._set_items) + + self.model = QStringListModel() + self.completer = QCompleter(self.model) + self.completer.setPopup(self.completer.popup()) + self.completer.setCompletionMode(QCompleter.UnfilteredPopupCompletion) + self.setCompleter(self.completer) + + def _set_items(self): + text = self.text() + + words = text.split(" ") + if len(words) == 0: + self.model.setStringList([]) + return + + alternatives = AB_metadata.auto_complete(words[-1], database=self.database_name) + alternatives = alternatives[words[-1]][:5] # allow for max n autocompletes + print(alternatives) + + items = [] + for alternative in alternatives: + line = " ".join(words[:-1] + [alternative]) + items.append(line) + self.model.setStringList(items) From 2b61e161329363e4d1ccf0b44d64c0eadc506a01 Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Thu, 4 Sep 2025 21:01:33 +0200 Subject: [PATCH 23/47] cache database identifiers for faster results + much faster autocomplete --- .../bwutils/searchengine/metadata_search.py | 132 ++++++++++-------- 1 file changed, 71 insertions(+), 61 deletions(-) diff --git a/activity_browser/bwutils/searchengine/metadata_search.py b/activity_browser/bwutils/searchengine/metadata_search.py index ef6162723..ce8483373 100644 --- a/activity_browser/bwutils/searchengine/metadata_search.py 
+++ b/activity_browser/bwutils/searchengine/metadata_search.py @@ -12,76 +12,92 @@ class MetaDataSearchEngine(SearchEngine): + def database_id_manager(self, database): + if not hasattr(self, "all_database_ids"): + self.all_database_ids = {} - def auto_complete(self, text: str, database) -> OrderedDict: - """Based on spellchecker, make more useful for autocompletions - """ - if database is not None: + if database_ids := self.all_database_ids.get(database): + self.database_ids = database_ids + elif database is not None: self.database_ids = set(self.df[self.df["database"] == database].index.to_list()) + self.all_database_ids[database] = self.database_ids else: self.database_ids = None + def reset_database_id_manager(self): + del self.all_database_ids + del self.database_ids + + def add_identifier(self, data: pd.DataFrame) -> None: + super().add_identifier(data) + self.reset_database_id_manager() + + def remove_identifier(self, identifier, logging=True) -> None: + super().remove_identifier(identifier, logging=logging) + self.reset_database_id_manager() + + def change_identifier(self, identifier, data: pd.DataFrame) -> None: + super().change_identifier(identifier, data) + self.reset_database_id_manager() + + def auto_complete(self, word: str, database) -> OrderedDict: + """Based on spellchecker, make more useful for autocompletions + """ + self.database_id_manager(database) + count_occurence = lambda x: sum(self.word_to_identifier[x].values()) # count occurences of a word word_results = OrderedDict() - matches_min = 3 # ideally we have at least this many alternatives - matches_max = 10 # ideally don't much more than this many matches - always_accept_this = 1 # values of this edit distance or lower always accepted + matches_min = 2 # ideally we have at least this many alternatives + matches_max = 4 # ideally don't much more than this many matches never_accept_this = 4 # values this edit distance or over always rejected - # make list of unique words - words = OrderedDict() - for word in text.split(" "): - words[word] = False - words = words.keys() - - words = [self.clean_text(word) for word in words] + # first, find possible matches quickly + q_grams = self.text_to_positional_q_gram(word) + possible_matches = self.find_q_gram_matches(set(q_grams)) - for word in words: - # first, find possible matches quickly - q_grams = self.text_to_positional_q_gram(word) - possible_matches = self.find_q_gram_matches(set(q_grams)) + matches = [] + first_matches = Counter() + other_matches = {} - matches = [] - first_matches = Counter() - other_matches = {} + # now, refine with edit distance + for row in possible_matches.itertuples(): - # now, refine with edit distance - for row in possible_matches.itertuples(): + if len(word) > len(row[1]) or word == row[1]: + continue + test_word = row[1][:len(word)] - edit_distance = self.osa_distance(word, row[1], cutoff=never_accept_this) + edit_distance = self.osa_distance(word, test_word, cutoff=min(never_accept_this, len(word))) - if edit_distance == 0: - continue # we are looking for alternatives only, not the exact word - elif edit_distance <= always_accept_this: - first_matches[row[1]] = count_occurence(row[1]) - elif edit_distance < never_accept_this: - if not other_matches.get(edit_distance): - other_matches[edit_distance] = Counter() - other_matches[edit_distance][row[1]] = count_occurence(row[1]) - else: - continue - - # add matches in correct order: - for match, _ in first_matches.most_common(): - matches.append(match) - # if we have fewer matches than goal, add 
more 'less good' matches - if len(matches) < matches_min: - for i in range(always_accept_this + 1, never_accept_this): - # iteratively increase matches with 'worse' results so we hit goal of minimum alternatives - if new := other_matches.get(i): - prev_num = 10e100 - for match, num in new.most_common(): - if num == prev_num: - matches.append(match) - elif num != prev_num and len(matches) <= matches_max: - matches.append(match) - else: - break - prev_num = num + if edit_distance == 0: + first_matches[row[1]] = count_occurence(row[1]) + elif edit_distance < never_accept_this: + if not other_matches.get(edit_distance): + other_matches[edit_distance] = Counter() + other_matches[edit_distance][row[1]] = count_occurence(row[1]) + else: + continue - word_results[word] = matches + # add matches in correct order: + for match, _ in first_matches.most_common(): + matches.append(match) + # if we have fewer matches than goal, add more 'less good' matches + if len(matches) < matches_min: + for i in range(1, never_accept_this): + # iteratively increase matches with 'worse' results so we hit goal of minimum alternatives + if new := other_matches.get(i): + prev_num = 10e100 + for match, num in new.most_common(): + if num == prev_num: + matches.append(match) + elif num != prev_num and len(matches) <= matches_max: + matches.append(match) + else: + break + prev_num = num + + word_results[word] = matches return word_results def find_q_gram_matches(self, q_grams: set) -> pd.DataFrame: @@ -141,10 +157,7 @@ def fuzzy_search(self, text: str, database: Optional[str] = None, return_counter return self.df.index.to_list() # DATABASE SPECIFIC get the set of ids that is in this database - if database is not None: - self.database_ids = set(self.df[self.df["database"] == database].index.to_list()) - else: - self.database_ids = None + self.database_id_manager(database) queries = self.build_queries(text) @@ -263,10 +276,7 @@ def search(self, text, database: Optional[str] = None) -> list: return self.df.index.to_list() # get the set of ids that is in this database - if database is not None: - self.database_ids = set(self.df[self.df["database"] == database].index.to_list()) - else: - self.database_ids = None + self.database_id_manager(database) fuzzy_identifiers = self.fuzzy_search(text, database=database, logging=False) if len(fuzzy_identifiers) == 0: From 64bbcd1082d28c0c8b0a4616043a13c55dfa58f9 Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Thu, 4 Sep 2025 22:07:54 +0200 Subject: [PATCH 24/47] Implement proper autocomplete popup --- .../bwutils/searchengine/metadata_search.py | 14 +++--- activity_browser/ui/widgets/line_edit.py | 47 ++++++++++++++++--- 2 files changed, 47 insertions(+), 14 deletions(-) diff --git a/activity_browser/bwutils/searchengine/metadata_search.py b/activity_browser/bwutils/searchengine/metadata_search.py index ce8483373..5bb01b2a4 100644 --- a/activity_browser/bwutils/searchengine/metadata_search.py +++ b/activity_browser/bwutils/searchengine/metadata_search.py @@ -40,15 +40,16 @@ def change_identifier(self, identifier, data: pd.DataFrame) -> None: super().change_identifier(identifier, data) self.reset_database_id_manager() - def auto_complete(self, word: str, database) -> OrderedDict: + def auto_complete(self, word: str, database) -> list: """Based on spellchecker, make more useful for autocompletions """ + if len(word) <= 2: + return [] + self.database_id_manager(database) count_occurence = lambda x: sum(self.word_to_identifier[x].values()) # count occurences of a word - word_results = 
OrderedDict() - matches_min = 2 # ideally we have at least this many alternatives matches_max = 4 # ideally don't much more than this many matches never_accept_this = 4 # values this edit distance or over always rejected @@ -63,13 +64,11 @@ def auto_complete(self, word: str, database) -> OrderedDict: # now, refine with edit distance for row in possible_matches.itertuples(): - if len(word) > len(row[1]) or word == row[1]: continue - test_word = row[1][:len(word)] + test_word = row[1][:len(word)] # only find edit distance of first part of word edit_distance = self.osa_distance(word, test_word, cutoff=min(never_accept_this, len(word))) - if edit_distance == 0: first_matches[row[1]] = count_occurence(row[1]) elif edit_distance < never_accept_this: @@ -97,8 +96,7 @@ def auto_complete(self, word: str, database) -> OrderedDict: break prev_num = num - word_results[word] = matches - return word_results + return matches def find_q_gram_matches(self, q_grams: set) -> pd.DataFrame: """Overwritten for extra database specific reduction of results. diff --git a/activity_browser/ui/widgets/line_edit.py b/activity_browser/ui/widgets/line_edit.py index d78c2557b..6e85ab11b 100644 --- a/activity_browser/ui/widgets/line_edit.py +++ b/activity_browser/ui/widgets/line_edit.py @@ -123,34 +123,69 @@ def __init__(self, items: list[str], parent=None): completer = QCompleter(items, self) self.setCompleter(completer) + class MetaDataAutoCompleteLineEdit(ABLineEdit): """Line Edit with MetaDataStore completer attached""" + + textChangedAutoCompleteDebounce: SignalInstance = Signal() + _debounce_autocomplete_ms = 75 + def __init__(self, parent=None): super().__init__(parent=parent) + + # debounce timer settings + self._debounce_autocomplete_timer = QTimer(self, singleShot=True) + # self.textChanged.connect(self._set_autocomplete_debounce) + self._debounce_autocomplete_timer.timeout.connect(self._emit_autocomplete_debounce) + self.database_name = "" - self.textChanged.connect(self._set_items) + # trigger autocomplete list update + self.textChangedAutoCompleteDebounce.connect(self._set_items) + # autocompleter settings self.model = QStringListModel() self.completer = QCompleter(self.model) - self.completer.setPopup(self.completer.popup()) - self.completer.setCompletionMode(QCompleter.UnfilteredPopupCompletion) + self.popup = self.completer.popup() + self.popup.setHorizontalScrollBarPolicy(Qt.ScrollBarAsNeeded) + self.popup.setMaximumHeight(20) + self.completer.setPopup(self.popup) + self.completer.setCompletionMode(QCompleter.UnfilteredPopupCompletion) # allow all items in popup list + self.setCompleter(self.completer) def _set_items(self): text = self.text() - + self.popup.setMaximumHeight(self.popup.sizeHintForRow(0) * 3 + 2 * self.popup.frameWidth()) words = text.split(" ") if len(words) == 0: self.model.setStringList([]) return alternatives = AB_metadata.auto_complete(words[-1], database=self.database_name) - alternatives = alternatives[words[-1]][:5] # allow for max n autocompletes - print(alternatives) + alternatives = alternatives[:5] # allow for max n autocompletes + print(text, alternatives) items = [] for alternative in alternatives: line = " ".join(words[:-1] + [alternative]) items.append(line) self.model.setStringList(items) + self.completer.complete() + + def _set_autocomplete_debounce(self): + self._debounce_autocomplete_timer.setInterval(self._debounce_autocomplete_ms) + self._debounce_autocomplete_timer.start() + + def _emit_autocomplete_debounce(self): + self.textChangedAutoCompleteDebounce.emit() + + def 
keyPressEvent(self, event): + if event.key() == Qt.Key_Escape: + if self.completer.popup().isVisible(): + self.completer.popup().hide() + event.accept() + return + super().keyPressEvent(event) + if event.text().strip(): + QTimer.singleShot(0, self._set_autocomplete_debounce) From e76f57c2b417dc9659a8d047a3f6fc65fd40645f Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Fri, 5 Sep 2025 13:39:28 +0200 Subject: [PATCH 25/47] suggestions for currently edited word instead of last word + better autocomplete menu behaviour --- activity_browser/ui/widgets/line_edit.py | 76 ++++++++++-------------- 1 file changed, 31 insertions(+), 45 deletions(-) diff --git a/activity_browser/ui/widgets/line_edit.py b/activity_browser/ui/widgets/line_edit.py index 6e85ab11b..244fa59c5 100644 --- a/activity_browser/ui/widgets/line_edit.py +++ b/activity_browser/ui/widgets/line_edit.py @@ -127,65 +127,51 @@ def __init__(self, items: list[str], parent=None): class MetaDataAutoCompleteLineEdit(ABLineEdit): """Line Edit with MetaDataStore completer attached""" - textChangedAutoCompleteDebounce: SignalInstance = Signal() - _debounce_autocomplete_ms = 75 - def __init__(self, parent=None): super().__init__(parent=parent) - - # debounce timer settings - self._debounce_autocomplete_timer = QTimer(self, singleShot=True) - # self.textChanged.connect(self._set_autocomplete_debounce) - self._debounce_autocomplete_timer.timeout.connect(self._emit_autocomplete_debounce) - self.database_name = "" - # trigger autocomplete list update - self.textChangedAutoCompleteDebounce.connect(self._set_items) - # autocompleter settings self.model = QStringListModel() self.completer = QCompleter(self.model) self.popup = self.completer.popup() self.popup.setHorizontalScrollBarPolicy(Qt.ScrollBarAsNeeded) - self.popup.setMaximumHeight(20) self.completer.setPopup(self.popup) - self.completer.setCompletionMode(QCompleter.UnfilteredPopupCompletion) # allow all items in popup list - + # allow all items in popup list + self.completer.setCompletionMode(QCompleter.UnfilteredPopupCompletion) self.setCompleter(self.completer) - def _set_items(self): - text = self.text() - self.popup.setMaximumHeight(self.popup.sizeHintForRow(0) * 3 + 2 * self.popup.frameWidth()) - words = text.split(" ") - if len(words) == 0: + # connect textEdited, this only triggers on user input, not Completer input + self.textEdited.connect(self._set_items) + + def _set_items(self, text=None): + if text is None: + text = self.text() + + # find the start and end of the word under the cursor + cursor_pos = self.cursorPosition() + start = cursor_pos + while start > 0 and text[start - 1] != " ": + start -= 1 + end = cursor_pos + while end < len(text) and text[end] != " ": + end += 1 + current_word = text[start:end] + if not current_word: self.model.setStringList([]) return - alternatives = AB_metadata.auto_complete(words[-1], database=self.database_name) - alternatives = alternatives[:5] # allow for max n autocompletes - print(text, alternatives) - + # get suggestions for the current word + alternatives = AB_metadata.auto_complete(current_word, database=self.database_name) + alternatives = alternatives[:6] # at most 6, though we should get ~3 usually + # replace the current word with each alternative items = [] - for alternative in alternatives: - line = " ".join(words[:-1] + [alternative]) - items.append(line) + for alt in alternatives: + new_text = text[:start] + alt + text[end:] + items.append(new_text) + print(text, items) + self.model.setStringList(items) - self.completer.complete() - - 
def _set_autocomplete_debounce(self): - self._debounce_autocomplete_timer.setInterval(self._debounce_autocomplete_ms) - self._debounce_autocomplete_timer.start() - - def _emit_autocomplete_debounce(self): - self.textChangedAutoCompleteDebounce.emit() - - def keyPressEvent(self, event): - if event.key() == Qt.Key_Escape: - if self.completer.popup().isVisible(): - self.completer.popup().hide() - event.accept() - return - super().keyPressEvent(event) - if event.text().strip(): - QTimer.singleShot(0, self._set_autocomplete_debounce) + # set correct height now that we have data + self.popup.setMaximumHeight(self.popup.sizeHintForRow(0) * 3 + 2 * self.popup.frameWidth()) + From eeed99277a23df1fb2e386f4f579c82edc779caf Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Fri, 5 Sep 2025 17:24:17 +0200 Subject: [PATCH 26/47] Improve text cleaning regex + autocomplete deals better with key hashes + manage popup height better --- activity_browser/bwutils/searchengine/base.py | 12 +++++++----- .../bwutils/searchengine/metadata_search.py | 13 +++++++++---- activity_browser/ui/widgets/line_edit.py | 6 +++++- 3 files changed, 21 insertions(+), 10 deletions(-) diff --git a/activity_browser/bwutils/searchengine/base.py b/activity_browser/bwutils/searchengine/base.py index 75b81424f..1cc8235ee 100644 --- a/activity_browser/bwutils/searchengine/base.py +++ b/activity_browser/bwutils/searchengine/base.py @@ -43,9 +43,9 @@ def __init__(self, df: pd.DataFrame, identifier_name: str, searchable_columns: l log.debug(f"SearchEngine initializing for {len(df)} items") # compile regex patterns for cleaning - self.SUB_PATTERN = re.compile(r"[,\(\)\[\]'\"…]") # for replacing with empty string - self.SPACE_PATTERN = re.compile(r"[-−:;/+]") # for replacing with space - self.ONE_SPACE_PATTERN = re.compile(r"\s+") # for replacing multiple white space with 1 space + self.SUB_END_PATTERN = re.compile(r"[,.\"'`)\[\]}\\/\-−_:;+…]+(?=\s|$)") # remove these from end of word + self.SUB_START_PATTERN = re.compile(r"(?:^|\s)[,.\"'`(\[{\\/\-−_:;+]+") # remove these from start of word + self.ONE_SPACE_PATTERN = re.compile(r"\s+") # remove these multiple whitespaces self.q = 2 # character length of q grams self.base_weight = 10 # base weighting for sorting results @@ -136,8 +136,10 @@ def update_dict(update_me: dict, new: dict) -> dict: def clean_text(self, text: str): """Clean a string so it doesn't contain weird characters or multiple spaces etc.""" - text = self.SUB_PATTERN.sub("", text.lower()) - text = self.SPACE_PATTERN.sub(" ", text) + text = text.lower() + text = self.SUB_END_PATTERN.sub("", text) + text = self.SUB_START_PATTERN.sub(" ", text) + text = self.ONE_SPACE_PATTERN.sub(" ", text).strip() return text diff --git a/activity_browser/bwutils/searchengine/metadata_search.py b/activity_browser/bwutils/searchengine/metadata_search.py index 5bb01b2a4..33bdc34e8 100644 --- a/activity_browser/bwutils/searchengine/metadata_search.py +++ b/activity_browser/bwutils/searchengine/metadata_search.py @@ -43,7 +43,7 @@ def change_identifier(self, identifier, data: pd.DataFrame) -> None: def auto_complete(self, word: str, database) -> list: """Based on spellchecker, make more useful for autocompletions """ - if len(word) <= 2: + if len(word) <= 1: return [] self.database_id_manager(database) @@ -52,7 +52,7 @@ def auto_complete(self, word: str, database) -> list: matches_min = 2 # ideally we have at least this many alternatives matches_max = 4 # ideally don't much more than this many matches - never_accept_this = 4 # values this edit distance 
or over always rejected + never_accept_this = 5 # values this edit distance or over always rejected # first, find possible matches quickly q_grams = self.text_to_positional_q_gram(word) @@ -61,6 +61,7 @@ def auto_complete(self, word: str, database) -> list: matches = [] first_matches = Counter() other_matches = {} + probably_keys = [] # if we suspect it's a key hash, dump it at the end of the list # now, refine with edit distance for row in possible_matches.itertuples(): @@ -68,8 +69,11 @@ def auto_complete(self, word: str, database) -> list: continue test_word = row[1][:len(word)] # only find edit distance of first part of word - edit_distance = self.osa_distance(word, test_word, cutoff=min(never_accept_this, len(word))) - if edit_distance == 0: + edit_distance = self.osa_distance(word, test_word, cutoff=min(never_accept_this, (len(word) * (2 / 3)))) + if len(row[1]) == 32 and edit_distance < never_accept_this: + # dump any items that are likely to be keys at the end of the list + probably_keys.append(row[1]) + elif edit_distance == 0: first_matches[row[1]] = count_occurence(row[1]) elif edit_distance < never_accept_this: if not other_matches.get(edit_distance): @@ -96,6 +100,7 @@ def auto_complete(self, word: str, database) -> list: break prev_num = num + matches = matches + probably_keys return matches def find_q_gram_matches(self, q_grams: set) -> pd.DataFrame: diff --git a/activity_browser/ui/widgets/line_edit.py b/activity_browser/ui/widgets/line_edit.py index 244fa59c5..9545c5943 100644 --- a/activity_browser/ui/widgets/line_edit.py +++ b/activity_browser/ui/widgets/line_edit.py @@ -173,5 +173,9 @@ def _set_items(self, text=None): self.model.setStringList(items) # set correct height now that we have data - self.popup.setMaximumHeight(self.popup.sizeHintForRow(0) * 3 + 2 * self.popup.frameWidth()) + max_height = max( + 20, + self.popup.sizeHintForRow(0) * 3 + 2 * self.popup.frameWidth() + ) + self.popup.setMaximumHeight(max_height) From bba71c7c0075870ff86be321dc1de32a01fa721c Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Fri, 5 Sep 2025 17:59:27 +0200 Subject: [PATCH 27/47] better key hash sorting --- activity_browser/bwutils/searchengine/base.py | 4 +--- .../bwutils/searchengine/metadata_search.py | 19 +++++++++---------- 2 files changed, 10 insertions(+), 13 deletions(-) diff --git a/activity_browser/bwutils/searchengine/base.py b/activity_browser/bwutils/searchengine/base.py index 1cc8235ee..fcee9e4cd 100644 --- a/activity_browser/bwutils/searchengine/base.py +++ b/activity_browser/bwutils/searchengine/base.py @@ -524,7 +524,6 @@ def spell_check(self, text: str) -> OrderedDict: q_grams = self.text_to_positional_q_gram(word) possible_matches = self.find_q_gram_matches(set(q_grams)) - matches = [] first_matches = Counter() other_matches = {} @@ -545,8 +544,7 @@ def spell_check(self, text: str) -> OrderedDict: continue # add matches in correct order: - for match, _ in first_matches.most_common(): - matches.append(match) + matches = [match for match, _ in first_matches.most_common()] # if we have fewer matches than goal, add more 'less good' matches if len(matches) < matches_min: for i in range(always_accept_this + 1, never_accept_this): diff --git a/activity_browser/bwutils/searchengine/metadata_search.py b/activity_browser/bwutils/searchengine/metadata_search.py index 33bdc34e8..43ae1d051 100644 --- a/activity_browser/bwutils/searchengine/metadata_search.py +++ b/activity_browser/bwutils/searchengine/metadata_search.py @@ -40,7 +40,7 @@ def change_identifier(self, identifier, data: 
pd.DataFrame) -> None: super().change_identifier(identifier, data) self.reset_database_id_manager() - def auto_complete(self, word: str, database) -> list: + def auto_complete(self, word: str, database: Optional[str] = None) -> list: """Based on spellchecker, make more useful for autocompletions """ if len(word) <= 1: @@ -53,15 +53,16 @@ def auto_complete(self, word: str, database) -> list: matches_min = 2 # ideally we have at least this many alternatives matches_max = 4 # ideally don't much more than this many matches never_accept_this = 5 # values this edit distance or over always rejected + # or max 2/3 of len(word) if less than never_accept_this + never_accept_this = int(round(min(never_accept_this, max(1, len(word) * (2 / 3))), 0)) # first, find possible matches quickly q_grams = self.text_to_positional_q_gram(word) possible_matches = self.find_q_gram_matches(set(q_grams)) - matches = [] first_matches = Counter() other_matches = {} - probably_keys = [] # if we suspect it's a key hash, dump it at the end of the list + probably_keys = Counter() # if we suspect it's a key hash, dump it at the end of the list # now, refine with edit distance for row in possible_matches.itertuples(): @@ -69,10 +70,9 @@ def auto_complete(self, word: str, database) -> list: continue test_word = row[1][:len(word)] # only find edit distance of first part of word - edit_distance = self.osa_distance(word, test_word, cutoff=min(never_accept_this, (len(word) * (2 / 3)))) - if len(row[1]) == 32 and edit_distance < never_accept_this: - # dump any items that are likely to be keys at the end of the list - probably_keys.append(row[1]) + edit_distance = self.osa_distance(word, test_word, cutoff=never_accept_this) + if len(row[1]) == 32 and edit_distance <= 1: + probably_keys[row[1]] = 100 - edit_distance # keys need to be sorted on edit distance, not on occurence elif edit_distance == 0: first_matches[row[1]] = count_occurence(row[1]) elif edit_distance < never_accept_this: @@ -83,8 +83,7 @@ def auto_complete(self, word: str, database) -> list: continue # add matches in correct order: - for match, _ in first_matches.most_common(): - matches.append(match) + matches = [match for match, _ in first_matches.most_common()] # if we have fewer matches than goal, add more 'less good' matches if len(matches) < matches_min: for i in range(1, never_accept_this): @@ -100,7 +99,7 @@ def auto_complete(self, word: str, database) -> list: break prev_num = num - matches = matches + probably_keys + matches = matches + [match for match, _ in probably_keys.most_common()] return matches def find_q_gram_matches(self, q_grams: set) -> pd.DataFrame: From 8e734369b34ddeecd2731f60dd5c01e9ac107eb9 Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Fri, 5 Sep 2025 19:21:58 +0200 Subject: [PATCH 28/47] better autocomplete performance when many long qgram matches --- .../bwutils/searchengine/metadata_search.py | 23 ++++++++++--------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/activity_browser/bwutils/searchengine/metadata_search.py b/activity_browser/bwutils/searchengine/metadata_search.py index 43ae1d051..4c776235d 100644 --- a/activity_browser/bwutils/searchengine/metadata_search.py +++ b/activity_browser/bwutils/searchengine/metadata_search.py @@ -43,13 +43,12 @@ def change_identifier(self, identifier, data: pd.DataFrame) -> None: def auto_complete(self, word: str, database: Optional[str] = None) -> list: """Based on spellchecker, make more useful for autocompletions """ + count_occurence = lambda x: 
sum(self.word_to_identifier[x].values()) # count occurences of a word if len(word) <= 1: return [] self.database_id_manager(database) - count_occurence = lambda x: sum(self.word_to_identifier[x].values()) # count occurences of a word - matches_min = 2 # ideally we have at least this many alternatives matches_max = 4 # ideally don't much more than this many matches never_accept_this = 5 # values this edit distance or over always rejected @@ -58,7 +57,7 @@ def auto_complete(self, word: str, database: Optional[str] = None) -> list: # first, find possible matches quickly q_grams = self.text_to_positional_q_gram(word) - possible_matches = self.find_q_gram_matches(set(q_grams)) + possible_matches = self.find_q_gram_matches(set(q_grams), return_all=True) first_matches = Counter() other_matches = {} @@ -68,9 +67,8 @@ def auto_complete(self, word: str, database: Optional[str] = None) -> list: for row in possible_matches.itertuples(): if len(word) > len(row[1]) or word == row[1]: continue - test_word = row[1][:len(word)] # only find edit distance of first part of word - - edit_distance = self.osa_distance(word, test_word, cutoff=never_accept_this) + # find edit distance of same size strings + edit_distance = self.osa_distance(word, row[1][:len(word)], cutoff=never_accept_this) if len(row[1]) == 32 and edit_distance <= 1: probably_keys[row[1]] = 100 - edit_distance # keys need to be sorted on edit distance, not on occurence elif edit_distance == 0: @@ -102,7 +100,7 @@ def auto_complete(self, word: str, database: Optional[str] = None) -> list: matches = matches + [match for match, _ in probably_keys.most_common()] return matches - def find_q_gram_matches(self, q_grams: set) -> pd.DataFrame: + def find_q_gram_matches(self, q_grams: set, return_all: bool = False) -> pd.DataFrame: """Overwritten for extra database specific reduction of results. """ n_q_grams = len(q_grams) @@ -137,10 +135,13 @@ def find_q_gram_matches(self, q_grams: set) -> pd.DataFrame: max_q = max(matches["matches"]) # this has the most matching q-grams # determine how many results we want to keep based on how good our results are - min_q = min(max(max_q * 0.32, # have at least a third of q-grams of best match or... - max(n_q_grams * 0.5, # if more, at least half the q-grams in the query word? - 1)), # okay just do 1 q-gram if there are no more in the word - max_q) # never have min_q be over max_q + if not return_all: + min_q = min(max(max_q * 0.32, # have at least a third of q-grams of best match or... + max(n_q_grams * 0.5, # if more, at least half the q-grams in the query word? 
+ 1)), # okay just do 1 q-gram if there are no more in the word + max_q) # never have min_q be over max_q + else: + min_q = 0 matches = matches[matches["matches"] >= min_q] matches = matches.sort_values(by="matches", ascending=False) From 2f078595a16278dd7390b0c3679a4db519b32e43 Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Fri, 5 Sep 2025 21:46:24 +0200 Subject: [PATCH 29/47] resolve bug with removing identifier from searchengine leading to breaking search --- activity_browser/bwutils/searchengine/base.py | 8 ++++++-- tests/test_search.py | 10 ++++++++-- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/activity_browser/bwutils/searchengine/base.py b/activity_browser/bwutils/searchengine/base.py index fcee9e4cd..635d58b20 100644 --- a/activity_browser/bwutils/searchengine/base.py +++ b/activity_browser/bwutils/searchengine/base.py @@ -292,7 +292,6 @@ def remove_identifier(self, identifier, logging=True) -> None: raise Exception( f"Identifier '{identifier}' does not exist in the search data, cannot remove identifier that do not exist.") - # remove from df self.df = self.df.drop(identifier) # find words that may need to be removed @@ -309,10 +308,15 @@ def remove_identifier(self, identifier, logging=True) -> None: # this q_gram is only used in this word, # remove it del self.q_gram_to_word[q_gram] + elif len(self.q_gram_to_word[q_gram]) > 1: + # this q_gram is used in multiple words, only remove the word from the q_gram + del self.q_gram_to_word[q_gram][word] del self.word_to_q_grams[word] else: - # remove the identifier from the dict + # this word is found in multiple identifiers + # word_to_q_gram and q_gram_to_word do not need to be changed, the word still exists + # remove the identifier the word in word_to_identifier del self.word_to_identifier[word][identifier] # finally, remove the identifier del self.identifier_to_word[identifier] diff --git a/tests/test_search.py b/tests/test_search.py index 6d63c14ee..727870359 100644 --- a/tests/test_search.py +++ b/tests/test_search.py @@ -9,8 +9,8 @@ def data_for_test(): ["b", "coal production", "something"], ["c", "coal production", "coat"], ["d", "coal hello production", "something"], - ["e", "dont find me", "hello world"], - ["f", "coat", "another word"], + ["e", "dont zzfind me", "hello world"], + ["f", "coat", "zzanother word"], ["g", "coalispartofthisword", "things"], ["h", "coal", "coal"], ], @@ -199,6 +199,12 @@ def test_search_remove_identifier(): se.remove_identifier(identifier="a") assert se.search("coal production") == ["c", "b", "d", "h", "f", "g"] + # now search on something only in a column we later remove + assert se.search("find") == ["e"] + se.remove_identifier(identifier="e") + assert se.search("find") == [] + + def test_search_change_identifier(): """Do tests for changing identifier.""" From 6e5d1cbb9813bf9b501865d32648979599edcde2 Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Fri, 5 Sep 2025 22:47:18 +0200 Subject: [PATCH 30/47] add functionality for adding, changing and removing identifiers (except full databases) --- activity_browser/bwutils/metadata.py | 49 ++++++++++++++++--- activity_browser/bwutils/searchengine/base.py | 2 +- .../bwutils/searchengine/metadata_search.py | 25 ++++++++-- tests/test_search.py | 1 - 4 files changed, 64 insertions(+), 13 deletions(-) diff --git a/activity_browser/bwutils/metadata.py b/activity_browser/bwutils/metadata.py index 70ab0606f..04498e991 100644 --- a/activity_browser/bwutils/metadata.py +++ b/activity_browser/bwutils/metadata.py @@ -67,6 +67,12 @@ def __init__(self, 
parent=None): self.moveToThread(application.thread()) self.connect_signals() + self.search_engine_whitelist = [ + "id", "name", "synonyms", "unit", "key", "database", # generic + "CAS number", "categories", # biosphere specific + "product", "reference product", "classifications", "location", "properties" # activity specific + ] + def connect_signals(self): signals.project.changed.connect(self.sync) signals.node.changed.connect(self.on_node_changed) @@ -76,11 +82,29 @@ def connect_signals(self): def on_node_deleted(self, ds): try: - self.dataframe.drop(ds.key, inplace=True) + self.dataframe = self.dataframe.drop(ds.key) + self.remove_identifier_from_search_engine(ds) self.synced.emit() except KeyError: pass + def remove_identifier_from_search_engine(self, ds, reset_db_ids=True, logging=True): + data = model_to_dict(ds) + identifier = data["id"] + if identifier in self.search_engine.database_id_manager(data["database"]): + self.search_engine.remove_identifier(identifier, logging=logging) + if reset_db_ids: + self.search_engine.reset_database_id_manager() + + def remove_identifiers_from_search_engine(self, identifiers): + t = time() + for identifier in identifiers: + self.remove_identifier_from_search_engine(identifier, reset_db_ids=False, logging=False) + self.search_engine.reset_database_id_manager() + log.debug(f"Search index updated in {time() - t:.2f} seconds " + f"for {len(identifiers)} removed items " + f"({len(self.search_engine.df)} items ({self.search_engine.size_of_index()}) currently).") + def on_node_changed(self, new, old): data_raw = model_to_dict(new) data = data_raw.pop("data") @@ -98,13 +122,28 @@ def on_node_changed(self, new, old): for col in [col for col in data.columns if col not in self.dataframe.columns]: self.dataframe[col] = pd.NA self.dataframe.loc[new.key] = data.loc[new.key] + self.change_identifier_in_search_engine(identifier=data.loc[new.key, "id"], data=data.loc[[new.key]]) elif self.dataframe.empty: # an activity has been added and the dataframe was empty self.dataframe = data + self.add_identifier_to_search_engine(data) else: # an activity has been added and needs to be concatenated to existing metadata self.dataframe = pd.concat([self.dataframe, data], join="outer") + self.add_identifier_to_search_engine(data) self.thread().eventDispatcher().awake.connect(self._emitSyncLater, Qt.ConnectionType.UniqueConnection) + def add_identifier_to_search_engine(self, data: pd.DataFrame): + search_engine_cols = list(set(data.columns) & set(self.search_engine_whitelist)) # intersection becomes columns + data = data[search_engine_cols] + self.search_engine.add_identifier(data.copy()) + self.search_engine.reset_database_id_manager() + + def change_identifier_in_search_engine(self, identifier, data: pd.DataFrame): + search_engine_cols = list(set(data.columns) & set(self.search_engine_whitelist)) # intersection becomes columns + data = data[search_engine_cols] + self.search_engine.change_identifier(identifier=identifier, data=data.copy()) + self.search_engine.reset_database_id_manager() + @property def databases(self): return set(self.dataframe.get("database", [])) @@ -354,13 +393,9 @@ def _unpacker(self, classifications: list, system: str) -> list: return system_classifications def init_search(self): - allowed_cols = [ - "id", "name", "synonyms", "unit", "key", "database", # generic - "CAS number", "categories", # biosphere specific - "product", "reference product", "classifications", "location", "properties" # activity specific - ] - self.search_engine = 
MetaDataSearchEngine(self.dataframe, identifier_name="id", searchable_columns=allowed_cols) + + self.search_engine = MetaDataSearchEngine(self.dataframe, identifier_name="id", searchable_columns=self.search_engine_whitelist) def db_search(self, query:str, database: Optional[str] = None, return_counter: bool = False): return self.search_engine.fuzzy_search(query, database=database, return_counter=return_counter) diff --git a/activity_browser/bwutils/searchengine/base.py b/activity_browser/bwutils/searchengine/base.py index 635d58b20..f0f34261b 100644 --- a/activity_browser/bwutils/searchengine/base.py +++ b/activity_browser/bwutils/searchengine/base.py @@ -262,7 +262,7 @@ def add_identifier(self, data: pd.DataFrame) -> None: # add cols to new data that are missing for col in df_cols: if col not in data.columns: - data[col] = [""] * len(data) + data.loc[:, col] = [""] * len(data) # re-order cols, first existing, then new df_col_set = set(df_cols) new_cols = [col for col in data.columns if col not in self.columns if col not in df_col_set] diff --git a/activity_browser/bwutils/searchengine/metadata_search.py b/activity_browser/bwutils/searchengine/metadata_search.py index 4c776235d..3e70a3cfd 100644 --- a/activity_browser/bwutils/searchengine/metadata_search.py +++ b/activity_browser/bwutils/searchengine/metadata_search.py @@ -23,17 +23,34 @@ def database_id_manager(self, database): self.all_database_ids[database] = self.database_ids else: self.database_ids = None + return self.database_ids def reset_database_id_manager(self): - del self.all_database_ids - del self.database_ids + if hasattr(self, "all_database_ids"): + del self.all_database_ids + if hasattr(self, "database_ids"): + del self.database_ids def add_identifier(self, data: pd.DataFrame) -> None: super().add_identifier(data) self.reset_database_id_manager() - def remove_identifier(self, identifier, logging=True) -> None: - super().remove_identifier(identifier, logging=logging) + + def remove_identifiers(self, identifiers, logging=True) -> None: + t = time() + + identifiers = set(identifiers) + current_identifiers = set(self.df.index.to_list()) + identifiers = identifiers | current_identifiers # only remove identifiers currently in the data + if len(identifiers) == 0: + return + + for identifier in identifiers: + super().remove_identifier(identifier, logging=False) + + if logging: + log.debug(f"Search index updated in {time() - t:.2f} seconds " + f"for {len(identifiers)} removed items ({len(self.df)} items ({self.size_of_index()}) currently).") self.reset_database_id_manager() def change_identifier(self, identifier, data: pd.DataFrame) -> None: diff --git a/tests/test_search.py b/tests/test_search.py index 727870359..0c40f4340 100644 --- a/tests/test_search.py +++ b/tests/test_search.py @@ -205,7 +205,6 @@ def test_search_remove_identifier(): assert se.search("find") == [] - def test_search_change_identifier(): """Do tests for changing identifier.""" df = data_for_test() From 0bd672c66a0f94d11b49ec7456e1846fdacaef2a Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Sat, 6 Sep 2025 16:29:52 +0200 Subject: [PATCH 31/47] add functionality for adding and removing full databases --- activity_browser/bwutils/metadata.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/activity_browser/bwutils/metadata.py b/activity_browser/bwutils/metadata.py index 04498e991..e8a1523ce 100644 --- a/activity_browser/bwutils/metadata.py +++ b/activity_browser/bwutils/metadata.py @@ -88,18 +88,17 @@ def on_node_deleted(self, ds): 
except KeyError: pass - def remove_identifier_from_search_engine(self, ds, reset_db_ids=True, logging=True): + def remove_identifier_from_search_engine(self, ds): data = model_to_dict(ds) identifier = data["id"] if identifier in self.search_engine.database_id_manager(data["database"]): - self.search_engine.remove_identifier(identifier, logging=logging) - if reset_db_ids: - self.search_engine.reset_database_id_manager() + self.search_engine.remove_identifier(identifier) + self.search_engine.reset_database_id_manager() def remove_identifiers_from_search_engine(self, identifiers): t = time() for identifier in identifiers: - self.remove_identifier_from_search_engine(identifier, reset_db_ids=False, logging=False) + self.search_engine.remove_identifier(identifier, logging=False) self.search_engine.reset_database_id_manager() log.debug(f"Search index updated in {time() - t:.2f} seconds " f"for {len(identifiers)} removed items " @@ -195,7 +194,10 @@ def sync_databases(self) -> None: for db_name in [x for x in self.databases if x not in bd.databases]: # deleted databases + remove_search_engine = self.dataframe[self.dataframe["database"] == db_name]["id"] self.dataframe.drop(db_name, level=0, inplace=True) + if len(remove_search_engine) > 0: + self.remove_identifiers_from_search_engine(remove_search_engine) sync = True for db_name in [x for x in bd.databases if x not in self.databases]: @@ -208,7 +210,7 @@ def sync_databases(self) -> None: self.dataframe = data else: self.dataframe = pd.concat([self.dataframe, data], join="outer") - + self.add_identifier_to_search_engine(data) sync = True if sync: From 4791c5647addbc212850a1c5346031fd7dd1eaca Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Sun, 7 Sep 2025 09:39:48 +0200 Subject: [PATCH 32/47] improve matching speed after metadata conversion to ProductModel --- activity_browser/bwutils/searchengine/base.py | 15 +++++++++++---- .../layouts/panes/database_products.py | 17 ++++++++--------- 2 files changed, 19 insertions(+), 13 deletions(-) diff --git a/activity_browser/bwutils/searchengine/base.py b/activity_browser/bwutils/searchengine/base.py index f0f34261b..4bcc3d45d 100644 --- a/activity_browser/bwutils/searchengine/base.py +++ b/activity_browser/bwutils/searchengine/base.py @@ -488,7 +488,7 @@ def find_q_gram_matches(self, q_grams: set) -> pd.DataFrame: return matches.iloc[:min(len(matches), 2500), :] # return at most this many results - def spell_check(self, text: str) -> OrderedDict: + def spell_check(self, text: str, skip_len=1) -> OrderedDict: """Create an OrderedDict of each word in the text (space separated) with as values possible alternatives. 
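The next hunk skips spell checking for very short words and scales the acceptable edit distance with word length, so short words only admit very close alternatives. A minimal sketch of that scaling rule (the helper name is illustrative; the patch inlines the expression):

```python
def scaled_cutoff(word: str, hard_limit: int = 4) -> int:
    """Roughly one accepted edit per three characters, capped at hard_limit."""
    return int(round(max(1, min(len(word) * 0.66, hard_limit)), 0))

assert scaled_cutoff("of") == 1           # 2 * 0.66 -> 1.32, floor of 1 applies
assert scaled_cutoff("coal") == 3         # 4 * 0.66 -> 2.64, rounds to 3
assert scaled_cutoff("production") == 4   # 10 * 0.66 -> 6.6, capped at hard_limit
```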
@@ -524,6 +524,13 @@ def spell_check(self, text: str) -> OrderedDict: words = [self.clean_text(word) for word in words] for word in words: + if len(word) <= skip_len: # dont look for alternatives for text this short + word_results[word] = [] + continue + + # reduce acceptable edit distance with short words + dont_accept = int(round(max(1, min((len(word) * 0.66), never_accept_this)), 0)) + # first, find possible matches quickly q_grams = self.text_to_positional_q_gram(word) possible_matches = self.find_q_gram_matches(set(q_grams)) @@ -534,13 +541,13 @@ def spell_check(self, text: str) -> OrderedDict: # now, refine with edit distance for row in possible_matches.itertuples(): - edit_distance = self.osa_distance(word, row[1], cutoff=never_accept_this) + edit_distance = self.osa_distance(word, row[1], cutoff=dont_accept) if edit_distance == 0: continue # we are looking for alternatives only, not the exact word elif edit_distance <= always_accept_this: first_matches[row[1]] = count_occurence(row[1]) - elif edit_distance < never_accept_this: + elif edit_distance < dont_accept: if not other_matches.get(edit_distance): other_matches[edit_distance] = Counter() other_matches[edit_distance][row[1]] = count_occurence(row[1]) @@ -551,7 +558,7 @@ def spell_check(self, text: str) -> OrderedDict: matches = [match for match, _ in first_matches.most_common()] # if we have fewer matches than goal, add more 'less good' matches if len(matches) < matches_min: - for i in range(always_accept_this + 1, never_accept_this): + for i in range(always_accept_this + 1, dont_accept): # iteratively increase matches with 'worse' results so we hit goal of minimum alternatives if new := other_matches.get(i): prev_num = 10e100 diff --git a/activity_browser/layouts/panes/database_products.py b/activity_browser/layouts/panes/database_products.py index 48de92aa1..680f4d2eb 100644 --- a/activity_browser/layouts/panes/database_products.py +++ b/activity_browser/layouts/panes/database_products.py @@ -493,19 +493,18 @@ def external_search(self, query): df = AB_metadata.dataframe[AB_metadata.dataframe["id"].isin(result_ids)].loc[:, ["id", "key"]] df = df.set_index("key", drop=True) translate_dict = df.to_dict()["id"] + result_keys = set(translate_dict.keys()) # convert the metadata id scores to row id scores row_scores = Counter() - df = self.dataframe.copy() - act_idx = set(df[df["activity_key"].isin(translate_dict.keys())].index.to_list()) - prd_idx = set(df[df["product_key"].isin(translate_dict.keys())].index.to_list()) - indices = act_idx | prd_idx # combine the two sets ('|' is a set union) - # iterate over the indices - for index in indices: - act_score = results.get(translate_dict.get(df.loc[index, "activity_key"]), 0) - prd_score = results.get(translate_dict.get(df.loc[index, "product_key"]), 0) - row_scores[index] = act_score + prd_score + match_df = self.dataframe[self.dataframe["activity_key"].isin(result_keys) | self.dataframe["product_key"].isin(result_keys)] + match_df = match_df.loc[:, ["activity_key", "product_key"]] + for row in match_df.itertuples(): + act_score = results.get(row[1], 0) + prd_score = results.get(row[2], 0) + row_scores[row[0]] = act_score + prd_score # finally only return the indices sorted_indices = [identifier[0] for identifier in row_scores.most_common()] + return sorted_indices From 532cac268a40e7d66f5ad2e11e7114424a99598c Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Sun, 7 Sep 2025 18:42:12 +0200 Subject: [PATCH 33/47] make autocomplete suggestions aware of context of other words in query, 
improving usefulness --- activity_browser/bwutils/searchengine/base.py | 2 +- .../bwutils/searchengine/metadata_search.py | 37 +++++++++++++++---- 2 files changed, 31 insertions(+), 8 deletions(-) diff --git a/activity_browser/bwutils/searchengine/base.py b/activity_browser/bwutils/searchengine/base.py index 4bcc3d45d..ca36e452a 100644 --- a/activity_browser/bwutils/searchengine/base.py +++ b/activity_browser/bwutils/searchengine/base.py @@ -753,7 +753,7 @@ def fuzzy_search(self, text: str, return_counter: bool = False) -> list: if return_counter: return all_identifiers # now sort on highest weights and make list type - sorted_identifiers = [identifier[0] for identifier in all_identifiers.most_common()] + sorted_identifiers = [identifier for identifier, _ in all_identifiers.most_common()] return sorted_identifiers def search(self, text) -> list: diff --git a/activity_browser/bwutils/searchengine/metadata_search.py b/activity_browser/bwutils/searchengine/metadata_search.py index 3e70a3cfd..8e55d27cc 100644 --- a/activity_browser/bwutils/searchengine/metadata_search.py +++ b/activity_browser/bwutils/searchengine/metadata_search.py @@ -57,20 +57,43 @@ def change_identifier(self, identifier, data: pd.DataFrame) -> None: super().change_identifier(identifier, data) self.reset_database_id_manager() - def auto_complete(self, word: str, database: Optional[str] = None) -> list: + def auto_complete(self, word: str, context: Optional[set] = set(), database: Optional[str] = None) -> list: """Based on spellchecker, make more useful for autocompletions """ - count_occurence = lambda x: sum(self.word_to_identifier[x].values()) # count occurences of a word + def word_to_identifier_to_word(check_word): + # assumes context words are correctly spelled + if len(context) == 0: + return 1 + multiplier = 1 + for identifier in self.word_to_identifier[check_word]: + for context_word in context: + for spell_checked_context_word in spell_checked_context[context_word]: + if spell_checked_context_word in self.identifier_to_word[identifier]: + multiplier += 1 + if context_word not in self.word_to_identifier.keys(): + continue + if context_word in self.identifier_to_word[identifier]: + multiplier += 3 + return multiplier + + # count occurrences of a word, count double so word_to_identifier_to_word will never multiply by 1 + count_occurrence = lambda x: sum(self.word_to_identifier[x].values()) * 2 + if len(word) <= 1: return [] self.database_id_manager(database) + if len(context) > 0: + spell_checked_context = {} + for context_word in context: + spell_checked_context[context_word] = self.spell_check(context_word)[context_word][:5] + matches_min = 2 # ideally we have at least this many alternatives matches_max = 4 # ideally don't much more than this many matches - never_accept_this = 5 # values this edit distance or over always rejected + never_accept_this = 4 # values this edit distance or over always rejected # or max 2/3 of len(word) if less than never_accept_this - never_accept_this = int(round(min(never_accept_this, max(1, len(word) * (2 / 3))), 0)) + never_accept_this = int(round(max(1, min((len(word) * 0.66), never_accept_this)), 0)) # first, find possible matches quickly q_grams = self.text_to_positional_q_gram(word) @@ -89,11 +112,11 @@ def auto_complete(self, word: str, database: Optional[str] = None) -> list: if len(row[1]) == 32 and edit_distance <= 1: probably_keys[row[1]] = 100 - edit_distance # keys need to be sorted on edit distance, not on occurence elif edit_distance == 0: - first_matches[row[1]] = 
count_occurence(row[1]) - elif edit_distance < never_accept_this: + first_matches[row[1]] = count_occurrence(row[1]) * word_to_identifier_to_word(row[1]) + elif edit_distance < never_accept_this and len(first_matches) < matches_min: if not other_matches.get(edit_distance): other_matches[edit_distance] = Counter() - other_matches[edit_distance][row[1]] = count_occurence(row[1]) + other_matches[edit_distance][row[1]] = count_occurrence(row[1]) * word_to_identifier_to_word(row[1]) else: continue From 42c359306a7251d4fc706fff112c75f26feb0844 Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Sun, 7 Sep 2025 18:56:41 +0200 Subject: [PATCH 34/47] ProductModel suggestions now include literal matches better --- activity_browser/bwutils/metadata.py | 12 +++---- .../bwutils/searchengine/metadata_search.py | 2 +- .../layouts/panes/database_products.py | 36 ++++++++++++++----- activity_browser/ui/widgets/line_edit.py | 3 +- 4 files changed, 37 insertions(+), 16 deletions(-) diff --git a/activity_browser/bwutils/metadata.py b/activity_browser/bwutils/metadata.py index e8a1523ce..32afb629b 100644 --- a/activity_browser/bwutils/metadata.py +++ b/activity_browser/bwutils/metadata.py @@ -395,19 +395,19 @@ def _unpacker(self, classifications: list, system: str) -> list: return system_classifications def init_search(self): - - self.search_engine = MetaDataSearchEngine(self.dataframe, identifier_name="id", searchable_columns=self.search_engine_whitelist) - def db_search(self, query:str, database: Optional[str] = None, return_counter: bool = False): - return self.search_engine.fuzzy_search(query, database=database, return_counter=return_counter) + def db_search(self, query:str, database: Optional[str] = None, return_counter: bool = False, logging: bool = True): + # we do fuzzy search as we re-index results (combining products and activities) for database_products table + # anyway, so including literal results quite literally is a waste of time at this point + return self.search_engine.fuzzy_search(query, database=database, return_counter=return_counter, logging=logging) def search(self, query:str): return self.search_engine.search(query) - def auto_complete(self, word:str, database: Optional[str] = None): + def auto_complete(self, word:str, context: Optional[set] = None, database: Optional[str] = None): word = self.search_engine.clean_text(word) - completions = self.search_engine.auto_complete(word, database) + completions = self.search_engine.auto_complete(word, context=context, database=database) return completions AB_metadata = MetaDataStore() diff --git a/activity_browser/bwutils/searchengine/metadata_search.py b/activity_browser/bwutils/searchengine/metadata_search.py index 8e55d27cc..ff580b88b 100644 --- a/activity_browser/bwutils/searchengine/metadata_search.py +++ b/activity_browser/bwutils/searchengine/metadata_search.py @@ -87,7 +87,7 @@ def word_to_identifier_to_word(check_word): if len(context) > 0: spell_checked_context = {} for context_word in context: - spell_checked_context[context_word] = self.spell_check(context_word)[context_word][:5] + spell_checked_context[context_word] = self.spell_check(context_word).get(context_word, [])[:5] matches_min = 2 # ideally we have at least this many alternatives matches_max = 4 # ideally don't much more than this many matches diff --git a/activity_browser/layouts/panes/database_products.py b/activity_browser/layouts/panes/database_products.py index 680f4d2eb..caf83aace 100644 --- a/activity_browser/layouts/panes/database_products.py +++ 
b/activity_browser/layouts/panes/database_products.py @@ -485,7 +485,9 @@ def values_from_indices(key: str, indices: list[QtCore.QModelIndex]): return values def external_search(self, query): - results = AB_metadata.db_search(query, database=self.external_col_name, return_counter=True) + t = time() + results = AB_metadata.db_search(query, database=self.external_col_name, return_counter=True, logging=False) + t2 = time() # extract a dict with 'key' as key and 'id' as values from the metadata result_ids = set(results.keys()) @@ -496,15 +498,33 @@ def external_search(self, query): result_keys = set(translate_dict.keys()) # convert the metadata id scores to row id scores - row_scores = Counter() + best_row_scores = Counter() + remain_row_scores = Counter() match_df = self.dataframe[self.dataframe["activity_key"].isin(result_keys) | self.dataframe["product_key"].isin(result_keys)] - match_df = match_df.loc[:, ["activity_key", "product_key"]] + cols = ["activity_key", "product_key"] + cols = cols + [col for col in match_df.columns if col not in cols] + match_df = match_df.loc[:, cols] for row in match_df.itertuples(): - act_score = results.get(row[1], 0) - prd_score = results.get(row[2], 0) - row_scores[row[0]] = act_score + prd_score + # score higher if exact words occur + act_score = results.get(translate_dict.get(row[1]), 0) + prd_score = results.get(translate_dict.get(row[2]), 0) + row_text = str(row[1:]) + for query_word in query.split(" "): + if amt := query.count(query_word) > 0 and len(query_word) > 0: + best_row_scores[row[0]] = (act_score + prd_score) * amt + if query in row_text: + score = (best_row_scores.get(row[0], 0) + act_score + prd_score) * 2 + best_row_scores[row[0]] = score + else: + remain_row_scores[row[0]] = act_score + prd_score # finally only return the indices - sorted_indices = [identifier[0] for identifier in row_scores.most_common()] - + best_sorted_indices = [identifier for identifier, _ in best_row_scores.most_common()] + remain_sorted_indices = [identifier for identifier, _ in remain_row_scores.most_common()] + sorted_indices = best_sorted_indices + remain_sorted_indices + log.debug( + f"ProductModel search in '{self.external_col_name}' ({len(self.dataframe)} items) " + f"found {len(sorted_indices)} ({len(best_sorted_indices)} literal) results " + f"for '{query}' in {time() - t:.2f} seconds ({t2 - t:.2f}s actual search, {time() - t2:.2f}s reorder for table)" + ) return sorted_indices diff --git a/activity_browser/ui/widgets/line_edit.py b/activity_browser/ui/widgets/line_edit.py index 9545c5943..356b5707e 100644 --- a/activity_browser/ui/widgets/line_edit.py +++ b/activity_browser/ui/widgets/line_edit.py @@ -160,9 +160,10 @@ def _set_items(self, text=None): if not current_word: self.model.setStringList([]) return + context = set((text[:start] + text[end:]).split(" ")) # get suggestions for the current word - alternatives = AB_metadata.auto_complete(current_word, database=self.database_name) + alternatives = AB_metadata.auto_complete(current_word, context=context, database=self.database_name) alternatives = alternatives[:6] # at most 6, though we should get ~3 usually # replace the current word with each alternative items = [] From 4ec98fb890ce9d484264c0014db05977efb4afae Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Mon, 8 Sep 2025 11:56:04 +0200 Subject: [PATCH 35/47] Update line-edit autocompleter base class --- activity_browser/ui/widgets/line_edit.py | 118 +++++++++++++++++++++++ 1 file changed, 118 insertions(+) diff --git 
a/activity_browser/ui/widgets/line_edit.py b/activity_browser/ui/widgets/line_edit.py index 356b5707e..11fd8b793 100644 --- a/activity_browser/ui/widgets/line_edit.py +++ b/activity_browser/ui/widgets/line_edit.py @@ -180,3 +180,121 @@ def _set_items(self, text=None): ) self.popup.setMaximumHeight(max_height) +class ABTextEdit(QtWidgets.QTextEdit): + textChangedDebounce: SignalInstance = Signal(str) + _debounce_ms = 250 + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + self._debounce_timer = QTimer(self, singleShot=True) + + self.textChanged.connect(self._set_debounce) + self._debounce_timer.timeout.connect(self._emit_debounce) + + def _set_debounce(self): + self._debounce_timer.setInterval(self._debounce_ms) + self._debounce_timer.start() + + def _emit_debounce(self): + self.textChangedDebounce.emit(self.toPlainText()) + + def debounce(self): + return self._debounce_ms + + def setDebounce(self, ms: int): + self._debounce_ms = ms + + +class MetaDataAutoCompleteLineEdit(ABTextEdit): + """Line Edit with MetaDataStore completer attached""" + + def __init__(self, parent=None): + super().__init__(parent=parent) + self.database_name = "" + + # autocompleter settings + self.model = QStringListModel() + self.completer = QCompleter(self.model) + self.completer.setWidget(self) + self.popup = self.completer.popup() + self.popup.setHorizontalScrollBarPolicy(Qt.ScrollBarAsNeeded) + self.completer.setPopup(self.popup) + # allow all items in popup list + self.completer.setCompletionMode(QCompleter.UnfilteredPopupCompletion) + self.completer.activated.connect(self._insert_auto_complete) + + self.textChanged.connect(self.sanitize_input) + + def sanitize_input(self): + text = self.toPlainText() + text = AB_metadata.search_engine.ONE_SPACE_PATTERN.sub(" ", text) + self.blockSignals(True) + self.clear() + self.insertPlainText(text) + self.blockSignals(False) + if len(text) == 0: + self.popup.close() + + def _insert_auto_complete(self, completion): + self.clear() + self.insertPlainText(completion) + self.popup.close() + self._set_items() + + def _set_items(self): + text = self.toPlainText() + + # find the start and end of the word under the cursor + cursor_pos = self.textCursor().position() + start = cursor_pos + while start > 0 and text[start - 1] != " ": + start -= 1 + end = cursor_pos + while end < len(text) and text[end] != " ": + end += 1 + current_word = text[start:end] + if not current_word: + self.model.setStringList([]) + return + context = set((text[:start] + text[end:]).split(" ")) + + # get suggestions for the current word + alternatives = AB_metadata.auto_complete(current_word, context=context, database=self.database_name) + alternatives = alternatives[:6] # at most 6, though we should get ~3 usually + # replace the current word with each alternative + items = [] + for alt in alternatives: + new_text = text[:start] + alt + text[end:] + items.append(new_text) + print(text, items) + if len(items) == 0: + return + + self.model.setStringList(items) + # set correct height now that we have data + max_height = max( + 20, + self.popup.sizeHintForRow(0) * 3 + 2 * self.popup.frameWidth() + ) + self.popup.setMaximumHeight(max_height) + self.completer.complete() + + def keyPressEvent(self, event): + key = event.key() + + if key in (Qt.Key_Enter, Qt.Key_Return, Qt.Key_Tab): + # insert an autocomplete item + # capture enter/return/tab key + index = self.popup.currentIndex() + selected_text = index.data(Qt.DisplayRole) + self.completer.activated.emit(selected_text + " ") + return + 
elif key in (Qt.Key_Space,): + self.popup.close() + + super().keyPressEvent(event) + + # trigger on text input keys + if event.text(): # filters out non-text keys like arrows, shift, etc. + self._set_items() From 72e01d1850d446aa879b5f29a062e86b36ed5bbe Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Tue, 9 Sep 2025 10:25:10 +0200 Subject: [PATCH 36/47] Add marking of unknown words to search --- activity_browser/ui/widgets/line_edit.py | 99 ++++++++++++++++++++---- 1 file changed, 83 insertions(+), 16 deletions(-) diff --git a/activity_browser/ui/widgets/line_edit.py b/activity_browser/ui/widgets/line_edit.py index 11fd8b793..7095a5f88 100644 --- a/activity_browser/ui/widgets/line_edit.py +++ b/activity_browser/ui/widgets/line_edit.py @@ -1,6 +1,6 @@ from qtpy import QtWidgets from qtpy.QtCore import QTimer, Slot, Signal, SignalInstance, QStringListModel, Qt -from qtpy.QtGui import QTextFormat +from qtpy.QtGui import QTextFormat, QSyntaxHighlighter, QTextCharFormat, QTextDocument, QTextCursor from qtpy.QtWidgets import QCompleter from activity_browser.bwutils import AB_metadata @@ -180,6 +180,29 @@ def _set_items(self, text=None): ) self.popup.setMaximumHeight(max_height) + +class UnknownWordHighlighter(QSyntaxHighlighter): + def __init__(self, parent: QTextDocument, known_words: set): + super().__init__(parent) + self.known_words = known_words + + # define the format for unknown words + self.unknown_format = QTextCharFormat() + self.unknown_format.setUnderlineStyle(QTextCharFormat.SpellCheckUnderline) + self.unknown_format.setUnderlineColor(Qt.red) + + def highlightBlock(self, text: str): + if text.startswith("="): + return + words = text.split() + index = 0 + for word in words: + word_len = len(word) + if word and word not in self.known_words: + self.setFormat(index, word_len, self.unknown_format) + index += word_len + 1 # +1 for the space + + class ABTextEdit(QtWidgets.QTextEdit): textChangedDebounce: SignalInstance = Signal(str) _debounce_ms = 250 @@ -212,6 +235,7 @@ class MetaDataAutoCompleteLineEdit(ABTextEdit): def __init__(self, parent=None): super().__init__(parent=parent) self.database_name = "" + self.auto_complete_word = "" # autocompleter settings self.model = QStringListModel() @@ -225,27 +249,64 @@ def __init__(self, parent=None): self.completer.activated.connect(self._insert_auto_complete) self.textChanged.connect(self.sanitize_input) + self.highlighter = UnknownWordHighlighter(self.document(), set()) + self.cursorPositionChanged.connect(self._set_items) def sanitize_input(self): + self._debounce_timer.stop() text = self.toPlainText() - text = AB_metadata.search_engine.ONE_SPACE_PATTERN.sub(" ", text) - self.blockSignals(True) - self.clear() - self.insertPlainText(text) - self.blockSignals(False) + clean_text = AB_metadata.search_engine.ONE_SPACE_PATTERN.sub(" ", text) + + if clean_text != text: + cursor = self.textCursor() + position = cursor.position() + self.blockSignals(True) + self.clear() + self.insertPlainText(clean_text) + self.blockSignals(False) + cursor.setPosition(min(position, len(text))) + self.setTextCursor(cursor) + + known_words = set() + for identifier in AB_metadata.search_engine.database_id_manager(self.database_name): + known_words.update(AB_metadata.search_engine.identifier_to_word[identifier].keys()) + self.highlighter.known_words = known_words + if len(text) == 0: self.popup.close() + self._set_debounce() def _insert_auto_complete(self, completion): - self.clear() - self.insertPlainText(completion) + cursor = self.textCursor() + position = 
cursor.position() + text = self.toPlainText() + + start = position + while start > 0 and text[start - 1] != " ": + start -= 1 + new_position = start + len(completion) + 1 + + # select the word under the cursor + cursor.select(QTextCursor.WordUnderCursor) + # replace it with the completion + cursor.insertText(completion + " ") + # set the updated cursor to end of inserted word + space + cursor.setPosition(min(new_position, len(text[:start] + completion) + 1)) + self.setTextCursor(cursor) + self.popup.close() - self._set_items() + self.auto_complete_word = "" + self.model.setStringList([]) def _set_items(self): text = self.toPlainText() + if text.startswith("="): + self.model.setStringList([]) + self.auto_complete_word = "" + self.popup.close() + return - # find the start and end of the word under the cursor + # find the start and end of the word under the cursor cursor_pos = self.textCursor().position() start = cursor_pos while start > 0 and text[start - 1] != " ": @@ -257,8 +318,12 @@ def _set_items(self): if not current_word: self.model.setStringList([]) return - context = set((text[:start] + text[end:]).split(" ")) + if self.auto_complete_word == current_word: + # avoid unnecessary auto_complete calls if the current word didnt change + return + self.auto_complete_word = current_word + context = set((text[:start] + text[end:]).split(" ")) # get suggestions for the current word alternatives = AB_metadata.auto_complete(current_word, context=context, database=self.database_name) alternatives = alternatives[:6] # at most 6, though we should get ~3 usually @@ -266,9 +331,11 @@ def _set_items(self): items = [] for alt in alternatives: new_text = text[:start] + alt + text[end:] - items.append(new_text) - print(text, items) + # items.append(new_text) + items.append(alt) + print(cursor_pos, text, items) if len(items) == 0: + self.popup.close() return self.model.setStringList(items) @@ -287,8 +354,8 @@ def keyPressEvent(self, event): # insert an autocomplete item # capture enter/return/tab key index = self.popup.currentIndex() - selected_text = index.data(Qt.DisplayRole) - self.completer.activated.emit(selected_text + " ") + completion_text = index.data(Qt.DisplayRole) + self.completer.activated.emit(completion_text) return elif key in (Qt.Key_Space,): self.popup.close() @@ -296,5 +363,5 @@ def keyPressEvent(self, event): super().keyPressEvent(event) # trigger on text input keys - if event.text(): # filters out non-text keys like arrows, shift, etc. + if event.text() or key in (Qt.LeftArrow, Qt.RightArrow): # filters out non-text keys like arrows, shift, etc. 
self._set_items() From fbeb4554bd7f304be61ebb50d21671d648497687 Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Tue, 9 Sep 2025 10:25:37 +0200 Subject: [PATCH 37/47] drop literal search results --- activity_browser/bwutils/searchengine/base.py | 13 ++++++------ .../bwutils/searchengine/metadata_search.py | 12 +++++------ .../layouts/panes/database_products.py | 21 ++++--------------- 3 files changed, 15 insertions(+), 31 deletions(-) diff --git a/activity_browser/bwutils/searchengine/base.py b/activity_browser/bwutils/searchengine/base.py index ca36e452a..3d3ffe18c 100644 --- a/activity_browser/bwutils/searchengine/base.py +++ b/activity_browser/bwutils/searchengine/base.py @@ -516,13 +516,13 @@ def spell_check(self, text: str, skip_len=1) -> OrderedDict: never_accept_this = 4 # values this edit distance or over always rejected # make list of unique words + text = self.clean_text(text) words = OrderedDict() for word in text.split(" "): - words[word] = False + if len(word) != 0: + words[word] = False words = words.keys() - words = [self.clean_text(word) for word in words] - for word in words: if len(word) <= skip_len: # dont look for alternatives for text this short word_results[word] = [] @@ -703,10 +703,9 @@ def fuzzy_search(self, text: str, return_counter: bool = False) -> list: # finally, make this a Counter (with each item=1) so we can properly weigh things later query_id_sets = [set(query_to_identifier.get(q_word)) for q_word in query if query_to_identifier.get(q_word, False)] - if len(query_id_sets) > 0: - query_identifier_set = set.intersection(*query_id_sets) - else: - query_identifier_set = set() + if len(query_id_sets) == 0: + continue + query_identifier_set = set.intersection(*query_id_sets) if len(query_identifier_set) == 0: # there is no match for this combination of query words, skip break diff --git a/activity_browser/bwutils/searchengine/metadata_search.py b/activity_browser/bwutils/searchengine/metadata_search.py index ff580b88b..374ca56e0 100644 --- a/activity_browser/bwutils/searchengine/metadata_search.py +++ b/activity_browser/bwutils/searchengine/metadata_search.py @@ -61,7 +61,6 @@ def auto_complete(self, word: str, context: Optional[set] = set(), database: Opt """Based on spellchecker, make more useful for autocompletions """ def word_to_identifier_to_word(check_word): - # assumes context words are correctly spelled if len(context) == 0: return 1 multiplier = 1 @@ -73,7 +72,7 @@ def word_to_identifier_to_word(check_word): if context_word not in self.word_to_identifier.keys(): continue if context_word in self.identifier_to_word[identifier]: - multiplier += 3 + multiplier += 4 return multiplier # count occurrences of a word, count double so word_to_identifier_to_word will never multiply by 1 @@ -105,7 +104,7 @@ def word_to_identifier_to_word(check_word): # now, refine with edit distance for row in possible_matches.itertuples(): - if len(word) > len(row[1]) or word == row[1]: + if word == row[1]: continue # find edit distance of same size strings edit_distance = self.osa_distance(word, row[1][:len(word)], cutoff=never_accept_this) @@ -253,10 +252,9 @@ def fuzzy_search(self, text: str, database: Optional[str] = None, return_counter # finally, make this a Counter (with each item=1) so we can properly weigh things later query_id_sets = [set(query_to_identifier.get(q_word)) for q_word in query if query_to_identifier.get(q_word, False)] - if len(query_id_sets) > 0: - query_identifier_set = set.intersection(*query_id_sets) - else: - query_identifier_set = set() + if 
len(query_id_sets) == 0: + continue + query_identifier_set = set.intersection(*query_id_sets) if len(query_identifier_set) == 0: # there is no match for this combination of query words, skip break diff --git a/activity_browser/layouts/panes/database_products.py b/activity_browser/layouts/panes/database_products.py index caf83aace..86228490a 100644 --- a/activity_browser/layouts/panes/database_products.py +++ b/activity_browser/layouts/panes/database_products.py @@ -498,33 +498,20 @@ def external_search(self, query): result_keys = set(translate_dict.keys()) # convert the metadata id scores to row id scores - best_row_scores = Counter() - remain_row_scores = Counter() + row_scores = Counter() match_df = self.dataframe[self.dataframe["activity_key"].isin(result_keys) | self.dataframe["product_key"].isin(result_keys)] cols = ["activity_key", "product_key"] - cols = cols + [col for col in match_df.columns if col not in cols] match_df = match_df.loc[:, cols] for row in match_df.itertuples(): - # score higher if exact words occur act_score = results.get(translate_dict.get(row[1]), 0) prd_score = results.get(translate_dict.get(row[2]), 0) - row_text = str(row[1:]) - for query_word in query.split(" "): - if amt := query.count(query_word) > 0 and len(query_word) > 0: - best_row_scores[row[0]] = (act_score + prd_score) * amt - if query in row_text: - score = (best_row_scores.get(row[0], 0) + act_score + prd_score) * 2 - best_row_scores[row[0]] = score - else: - remain_row_scores[row[0]] = act_score + prd_score + row_scores[row[0]] = act_score + prd_score # finally only return the indices - best_sorted_indices = [identifier for identifier, _ in best_row_scores.most_common()] - remain_sorted_indices = [identifier for identifier, _ in remain_row_scores.most_common()] - sorted_indices = best_sorted_indices + remain_sorted_indices + sorted_indices = [identifier for identifier, _ in row_scores.most_common()] log.debug( f"ProductModel search in '{self.external_col_name}' ({len(self.dataframe)} items) " - f"found {len(sorted_indices)} ({len(best_sorted_indices)} literal) results " + f"found {len(sorted_indices)} results " f"for '{query}' in {time() - t:.2f} seconds ({t2 - t:.2f}s actual search, {time() - t2:.2f}s reorder for table)" ) return sorted_indices From 59e8e188066011894438dcf99e808cacd755af89 Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Tue, 9 Sep 2025 12:23:40 +0200 Subject: [PATCH 38/47] marginal speed increases for initializing/updating for base class --- activity_browser/bwutils/searchengine/base.py | 54 ++++++++++--------- 1 file changed, 29 insertions(+), 25 deletions(-) diff --git a/activity_browser/bwutils/searchengine/base.py b/activity_browser/bwutils/searchengine/base.py index 3d3ffe18c..a6292c874 100644 --- a/activity_browser/bwutils/searchengine/base.py +++ b/activity_browser/bwutils/searchengine/base.py @@ -1,7 +1,7 @@ from itertools import permutations, chain import itertools import functools -from collections import Counter, OrderedDict +from collections import Counter, OrderedDict, defaultdict from logging import getLogger from time import time from typing import Iterable, Optional @@ -99,11 +99,17 @@ def update_index(self, update_df: pd.DataFrame) -> None: def update_dict(update_me: dict, new: dict) -> dict: """Update a dict of counters with new dict of counters.""" - for dict_key, _counter in new.items(): - if dict_key in update_me: - update_me[dict_key].update(_counter) - else: - update_me[dict_key] = _counter + # set to empty set if we know update_me is empty, otherwise, find 
set intersection + update_keys = set() if len(update_me) == 0 else new.keys() & update_me.keys() + if len(update_keys) == 0: + new_data = new + else: + for update_key in update_keys: + update_me[update_key].update(new[update_key]) + new_data = {key: value for key, value in new.items() if key not in update_keys} + # finally add any completely new data + # update_me.update(new_data) + update_me = update_me | new_data return update_me t = time() @@ -112,8 +118,10 @@ def update_dict(update_me: dict, new: dict) -> dict: # identifier to word and df i2w, update_df = self.words_in_df(update_df) self.identifier_to_word = update_dict(self.identifier_to_word, i2w) + for col in [col for col in update_df.columns if col not in self.df]: + col_data = [""] * len(self.df) + self.df[col] = col_data self.df = pd.concat([self.df, update_df]) - self.df = self.df.fillna("") # ensure we don't add unwanted NA through concatenations # word to identifier w2i = self.reverse_dict_many_to_one(i2w) @@ -126,7 +134,6 @@ def update_dict(update_me: dict, new: dict) -> dict: # q-gram to word q2w = self.reverse_dict_many_to_one(w2q) self.q_gram_to_word = update_dict(self.q_gram_to_word, q2w) - size_new = len(self.df) size_dif = size_new - size_old size_msg = (f"{size_dif} changed items at {int(round(size_dif/(time() - t), 0))} items/sec " @@ -153,13 +160,12 @@ def text_to_positional_q_gram(self, text: str) -> list: Note: these are technically _positional_ q-grams, but we don't use their positions currently. """ q = self.q - + n = len(text) # just return a single-item list if the text is equal or shorter than q # else, generate q-grams - if len(text) <= q: + if n <= q: return [text] - else: - return [text[i:i + q] for i in range(len(text) - q + 1)] + return list(text[i:i + q] for i in range(n - q + 1)) def words_in_df(self, df: pd.DataFrame = None) -> tuple[dict, pd.DataFrame]: """Return a dict of {identifier: word} for df.""" @@ -176,39 +182,37 @@ def words_in_df(self, df: pd.DataFrame = None) -> tuple[dict, pd.DataFrame]: col.append(line) identifier_word_dict[row[0]] = Counter(line.split(" ")) return_df["query_col"] = col + return_df = return_df.fillna("") # ensure we don't add unwanted NA in new data return identifier_word_dict, return_df def reverse_dict_many_to_one(self, dictionary: dict) -> dict: """Reverse a dictionary of Counter objects.""" - reverse = {} + reverse = defaultdict(Counter) for identifier, counter_object in dictionary.items(): for countable, count in counter_object.items(): - if countable not in reverse: - reverse[countable] = Counter() reverse[countable][identifier] += count - return reverse + return dict(reverse) def list_to_q_grams(self, word_list: Iterable) -> dict: """Convert a list of unique words to a dict with Counter objects. Number will be the occurrences of that q-gram in that word. - q_gram_dict = { + return = { "word": Counter( "wo": 1 "or": 1 "rd": 1 - ) + ), + ... 
} - """ - q_gram_dict = {} - - for word in word_list: - q_gram_dict[word] = Counter(self.text_to_positional_q_gram(word)) - - return q_gram_dict + text_to_q_gram = self.text_to_positional_q_gram + return { + word: Counter(text_to_q_gram(word)) + for word in word_list + } def word_in_index(self, word: str) -> bool: """Convenience function to check if a single word is in the search index.""" From e04c20e2dfc21699b2c959e15ee2226797b77831 Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Tue, 9 Sep 2025 14:24:33 +0200 Subject: [PATCH 39/47] marginal speed increases for initializing/updating for base class --- activity_browser/bwutils/searchengine/base.py | 27 +++++++++---------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/activity_browser/bwutils/searchengine/base.py b/activity_browser/bwutils/searchengine/base.py index a6292c874..91cc64e12 100644 --- a/activity_browser/bwutils/searchengine/base.py +++ b/activity_browser/bwutils/searchengine/base.py @@ -116,12 +116,15 @@ def update_dict(update_me: dict, new: dict) -> dict: size_old = len(self.df) # identifier to word and df + t2 = time() i2w, update_df = self.words_in_df(update_df) + log.debug(f">>> DF {time() - t2:.2f}.") self.identifier_to_word = update_dict(self.identifier_to_word, i2w) for col in [col for col in update_df.columns if col not in self.df]: col_data = [""] * len(self.df) self.df[col] = col_data self.df = pd.concat([self.df, update_df]) + log.debug(f">>> tot {time() - t2:.2f}.") # word to identifier w2i = self.reverse_dict_many_to_one(i2w) @@ -170,21 +173,15 @@ def text_to_positional_q_gram(self, text: str) -> list: def words_in_df(self, df: pd.DataFrame = None) -> tuple[dict, pd.DataFrame]: """Return a dict of {identifier: word} for df.""" - df = df if any(df) else self.df - return_df = df.copy() - - df = df.iloc[:, self.searchable_columns] - identifier_word_dict = {} - col = [] - - for row in df.itertuples(index=True): - line = self.clean_text(" | ".join(row[1:])) - col.append(line) - identifier_word_dict[row[0]] = Counter(line.split(" ")) - return_df["query_col"] = col - return_df = return_df.fillna("") # ensure we don't add unwanted NA in new data - - return identifier_word_dict, return_df + df = df if df is not None else self.df.copy() + df = df.fillna("") # avoid nan + # assemble query_col + df["query_col"] = df.iloc[:, self.searchable_columns].astype(str).agg(" | ".join, axis=1) + # clean all text at once using vectorized operations + df["query_col"] = df["query_col"].apply(self.clean_text) + # build the identifier_word_dict dictionary + identifier_word_dict = df["query_col"].apply(lambda text: Counter(text.split(" "))).to_dict() + return identifier_word_dict, df def reverse_dict_many_to_one(self, dictionary: dict) -> dict: """Reverse a dictionary of Counter objects.""" From 1bedc53ff507a8952702736d2688569ac87dc3e9 Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Tue, 9 Sep 2025 17:12:09 +0200 Subject: [PATCH 40/47] Implement multiprocessing to increase speed for text cleaning during indexing. 
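Cleaning of the assembled query column is chunked and spread over a worker pool when the DataFrame is large enough to amortise process start-up; small inputs keep the single-process path. A simplified, self-contained sketch of the pattern (the cleaning body and threshold are placeholders, not the exact helpers added below):

```python
import math
import multiprocessing as mp

import pandas as pd


def clean_chunk(chunk: pd.DataFrame) -> pd.DataFrame:
    # placeholder for the per-chunk text cleaning applied to the query column
    return chunk.assign(query_col=chunk["query_col"].str.lower().str.strip())


def clean_parallel(df: pd.DataFrame, min_chunk_size: int = 2500) -> pd.DataFrame:
    """Clean in parallel only when chunks stay above min_chunk_size rows."""
    workers = max(1, mp.cpu_count() - 1)  # leave one core for the rest of the app
    if workers == 1 or len(df) < min_chunk_size * 2:
        return clean_chunk(df)
    chunk_size = int(math.ceil(len(df) / workers))
    chunks = [df.iloc[i:i + chunk_size] for i in range(0, len(df), chunk_size)]
    with mp.Pool(processes=workers) as pool:
        return pd.concat(pool.map(clean_chunk, chunks))
```

On platforms that spawn rather than fork (Windows, macOS), the worker must be importable at module level and the call should sit behind an `if __name__ == "__main__":` guard.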
--- activity_browser/bwutils/searchengine/base.py | 46 ++++++++++++++----- 1 file changed, 35 insertions(+), 11 deletions(-) diff --git a/activity_browser/bwutils/searchengine/base.py b/activity_browser/bwutils/searchengine/base.py index 91cc64e12..5a7752e3a 100644 --- a/activity_browser/bwutils/searchengine/base.py +++ b/activity_browser/bwutils/searchengine/base.py @@ -3,6 +3,8 @@ import functools from collections import Counter, OrderedDict, defaultdict from logging import getLogger +import math +import multiprocessing as mp from time import time from typing import Iterable, Optional import pandas as pd @@ -114,26 +116,16 @@ def update_dict(update_me: dict, new: dict) -> dict: t = time() size_old = len(self.df) - # identifier to word and df - t2 = time() i2w, update_df = self.words_in_df(update_df) - log.debug(f">>> DF {time() - t2:.2f}.") self.identifier_to_word = update_dict(self.identifier_to_word, i2w) - for col in [col for col in update_df.columns if col not in self.df]: - col_data = [""] * len(self.df) - self.df[col] = col_data self.df = pd.concat([self.df, update_df]) - log.debug(f">>> tot {time() - t2:.2f}.") - # word to identifier w2i = self.reverse_dict_many_to_one(i2w) self.word_to_identifier = update_dict(self.word_to_identifier, w2i) - # word to q-gram w2q = self.list_to_q_grams(w2i.keys()) self.word_to_q_grams = update_dict(self.word_to_q_grams, w2q) - # q-gram to word q2w = self.reverse_dict_many_to_one(w2q) self.q_gram_to_word = update_dict(self.q_gram_to_word, q2w) @@ -170,6 +162,38 @@ def text_to_positional_q_gram(self, text: str) -> list: return [text] return list(text[i:i + q] for i in range(n - q + 1)) + def df_clean_worker(self, df): + """Clean the text in query_col.""" + df["query_col"] = df["query_col"].apply(self.clean_text) + return df + + def df_clean(self, df): + """Clean the text in query_col. 
+ + apply multi-processing when the computer is able and its relevant + """ + def chunk_dataframe(df: pd.DataFrame, chunk_size: int): + """Split DataFrame into chunks of specified size.""" + return [df.iloc[i:i + chunk_size] for i in range(0, len(df), chunk_size)] + + max_cores = max(1, mp.cpu_count() - 1) # leave at least 1 core for other processes + min_chunk_size = 2500 + if max_cores > 1 and len(df) > min_chunk_size * 2: + for i in range(max_cores, 0, -1): + chunk_size = int(math.ceil(len(df) / i)) + if chunk_size >= min_chunk_size: + break + use_cores = i + else: + use_cores = 1 + if use_cores == 1: + return self.df_clean_worker(df) + + chunks = chunk_dataframe(df, chunk_size) + with mp.Pool(processes=use_cores) as pool: + results = pool.starmap(self.df_clean_worker, [(chunk,) for chunk in chunks]) + return pd.concat(results) + def words_in_df(self, df: pd.DataFrame = None) -> tuple[dict, pd.DataFrame]: """Return a dict of {identifier: word} for df.""" @@ -178,7 +202,7 @@ def words_in_df(self, df: pd.DataFrame = None) -> tuple[dict, pd.DataFrame]: # assemble query_col df["query_col"] = df.iloc[:, self.searchable_columns].astype(str).agg(" | ".join, axis=1) # clean all text at once using vectorized operations - df["query_col"] = df["query_col"].apply(self.clean_text) + df["query_col"] = self.df_clean(df.loc[:, ["query_col"]]) # build the identifier_word_dict dictionary identifier_word_dict = df["query_col"].apply(lambda text: Counter(text.split(" "))).to_dict() return identifier_word_dict, df From 169a7cbe30690c004f179e76f50b9eb8dae5ea66 Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Tue, 9 Sep 2025 17:12:39 +0200 Subject: [PATCH 41/47] Fix bug with incorrect text length settings --- activity_browser/ui/widgets/line_edit.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/activity_browser/ui/widgets/line_edit.py b/activity_browser/ui/widgets/line_edit.py index 7095a5f88..9414fa878 100644 --- a/activity_browser/ui/widgets/line_edit.py +++ b/activity_browser/ui/widgets/line_edit.py @@ -264,7 +264,7 @@ def sanitize_input(self): self.clear() self.insertPlainText(clean_text) self.blockSignals(False) - cursor.setPosition(min(position, len(text))) + cursor.setPosition(min(position, len(clean_text))) self.setTextCursor(cursor) known_words = set() @@ -317,6 +317,7 @@ def _set_items(self): current_word = text[start:end] if not current_word: self.model.setStringList([]) + self.popup.close() return if self.auto_complete_word == current_word: # avoid unnecessary auto_complete calls if the current word didnt change From 7efab029f392ec3fa81f8207b03162ad27048080 Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Tue, 9 Sep 2025 18:25:16 +0200 Subject: [PATCH 42/47] Fix to allow testing of metadatastore --- activity_browser/bwutils/metadata.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/activity_browser/bwutils/metadata.py b/activity_browser/bwutils/metadata.py index 32afb629b..6f96814fa 100644 --- a/activity_browser/bwutils/metadata.py +++ b/activity_browser/bwutils/metadata.py @@ -89,6 +89,8 @@ def on_node_deleted(self, ds): pass def remove_identifier_from_search_engine(self, ds): + if not hasattr(self, "search_engine"): + return data = model_to_dict(ds) identifier = data["id"] if identifier in self.search_engine.database_id_manager(data["database"]): @@ -96,6 +98,8 @@ def remove_identifier_from_search_engine(self, ds): self.search_engine.reset_database_id_manager() def remove_identifiers_from_search_engine(self, identifiers): + if not hasattr(self, "search_engine"): + 
return t = time() for identifier in identifiers: self.search_engine.remove_identifier(identifier, logging=False) @@ -132,12 +136,16 @@ def on_node_changed(self, new, old): self.thread().eventDispatcher().awake.connect(self._emitSyncLater, Qt.ConnectionType.UniqueConnection) def add_identifier_to_search_engine(self, data: pd.DataFrame): + if not hasattr(self, "search_engine"): + return search_engine_cols = list(set(data.columns) & set(self.search_engine_whitelist)) # intersection becomes columns data = data[search_engine_cols] self.search_engine.add_identifier(data.copy()) self.search_engine.reset_database_id_manager() def change_identifier_in_search_engine(self, identifier, data: pd.DataFrame): + if not hasattr(self, "search_engine"): + return search_engine_cols = list(set(data.columns) & set(self.search_engine_whitelist)) # intersection becomes columns data = data[search_engine_cols] self.search_engine.change_identifier(identifier=identifier, data=data.copy()) From 06747b839c94ac78a11c0d847a028c5c9562c92f Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Fri, 12 Sep 2025 17:02:08 +0200 Subject: [PATCH 43/47] Refactor textedit to proper location --- .../layouts/panes/database_products.py | 2 +- activity_browser/ui/widgets/line_edit.py | 260 +----------------- activity_browser/ui/widgets/text_edit.py | 251 +++++++++++++++++ 3 files changed, 254 insertions(+), 259 deletions(-) create mode 100644 activity_browser/ui/widgets/text_edit.py diff --git a/activity_browser/layouts/panes/database_products.py b/activity_browser/layouts/panes/database_products.py index 86228490a..475824266 100644 --- a/activity_browser/layouts/panes/database_products.py +++ b/activity_browser/layouts/panes/database_products.py @@ -60,7 +60,7 @@ def __init__(self, parent, db_name: str): self.model.has_external_search = True self.model.external_col_name = db_name - self.search = widgets.MetaDataAutoCompleteLineEdit(self) + self.search = widgets.MetaDataAutoCompleteTextEdit(self) self.search.database_name = db_name self.search.setMaximumHeight(30) self.search.setPlaceholderText("Quick Search") diff --git a/activity_browser/ui/widgets/line_edit.py b/activity_browser/ui/widgets/line_edit.py index 9414fa878..427663938 100644 --- a/activity_browser/ui/widgets/line_edit.py +++ b/activity_browser/ui/widgets/line_edit.py @@ -1,9 +1,6 @@ from qtpy import QtWidgets -from qtpy.QtCore import QTimer, Slot, Signal, SignalInstance, QStringListModel, Qt -from qtpy.QtGui import QTextFormat, QSyntaxHighlighter, QTextCharFormat, QTextDocument, QTextCursor -from qtpy.QtWidgets import QCompleter - -from activity_browser.bwutils import AB_metadata +from qtpy.QtCore import QTimer, Slot, Signal, SignalInstance +from qtpy.QtGui import QTextFormat class ABLineEdit(QtWidgets.QLineEdit): @@ -113,256 +110,3 @@ def focusOutEvent(self, event): self._before = after actions.ActivityModify.run(self._key, self._field, after) super(SignalledComboEdit, self).focusOutEvent(event) - - -class AutoCompleteLineEdit(QtWidgets.QLineEdit): - """Line Edit with a completer attached""" - - def __init__(self, items: list[str], parent=None): - super().__init__(parent=parent) - completer = QCompleter(items, self) - self.setCompleter(completer) - - -class MetaDataAutoCompleteLineEdit(ABLineEdit): - """Line Edit with MetaDataStore completer attached""" - - def __init__(self, parent=None): - super().__init__(parent=parent) - self.database_name = "" - - # autocompleter settings - self.model = QStringListModel() - self.completer = QCompleter(self.model) - self.popup = 
self.completer.popup() - self.popup.setHorizontalScrollBarPolicy(Qt.ScrollBarAsNeeded) - self.completer.setPopup(self.popup) - # allow all items in popup list - self.completer.setCompletionMode(QCompleter.UnfilteredPopupCompletion) - self.setCompleter(self.completer) - - # connect textEdited, this only triggers on user input, not Completer input - self.textEdited.connect(self._set_items) - - def _set_items(self, text=None): - if text is None: - text = self.text() - - # find the start and end of the word under the cursor - cursor_pos = self.cursorPosition() - start = cursor_pos - while start > 0 and text[start - 1] != " ": - start -= 1 - end = cursor_pos - while end < len(text) and text[end] != " ": - end += 1 - current_word = text[start:end] - if not current_word: - self.model.setStringList([]) - return - context = set((text[:start] + text[end:]).split(" ")) - - # get suggestions for the current word - alternatives = AB_metadata.auto_complete(current_word, context=context, database=self.database_name) - alternatives = alternatives[:6] # at most 6, though we should get ~3 usually - # replace the current word with each alternative - items = [] - for alt in alternatives: - new_text = text[:start] + alt + text[end:] - items.append(new_text) - print(text, items) - - self.model.setStringList(items) - # set correct height now that we have data - max_height = max( - 20, - self.popup.sizeHintForRow(0) * 3 + 2 * self.popup.frameWidth() - ) - self.popup.setMaximumHeight(max_height) - - -class UnknownWordHighlighter(QSyntaxHighlighter): - def __init__(self, parent: QTextDocument, known_words: set): - super().__init__(parent) - self.known_words = known_words - - # define the format for unknown words - self.unknown_format = QTextCharFormat() - self.unknown_format.setUnderlineStyle(QTextCharFormat.SpellCheckUnderline) - self.unknown_format.setUnderlineColor(Qt.red) - - def highlightBlock(self, text: str): - if text.startswith("="): - return - words = text.split() - index = 0 - for word in words: - word_len = len(word) - if word and word not in self.known_words: - self.setFormat(index, word_len, self.unknown_format) - index += word_len + 1 # +1 for the space - - -class ABTextEdit(QtWidgets.QTextEdit): - textChangedDebounce: SignalInstance = Signal(str) - _debounce_ms = 250 - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - self._debounce_timer = QTimer(self, singleShot=True) - - self.textChanged.connect(self._set_debounce) - self._debounce_timer.timeout.connect(self._emit_debounce) - - def _set_debounce(self): - self._debounce_timer.setInterval(self._debounce_ms) - self._debounce_timer.start() - - def _emit_debounce(self): - self.textChangedDebounce.emit(self.toPlainText()) - - def debounce(self): - return self._debounce_ms - - def setDebounce(self, ms: int): - self._debounce_ms = ms - - -class MetaDataAutoCompleteLineEdit(ABTextEdit): - """Line Edit with MetaDataStore completer attached""" - - def __init__(self, parent=None): - super().__init__(parent=parent) - self.database_name = "" - self.auto_complete_word = "" - - # autocompleter settings - self.model = QStringListModel() - self.completer = QCompleter(self.model) - self.completer.setWidget(self) - self.popup = self.completer.popup() - self.popup.setHorizontalScrollBarPolicy(Qt.ScrollBarAsNeeded) - self.completer.setPopup(self.popup) - # allow all items in popup list - self.completer.setCompletionMode(QCompleter.UnfilteredPopupCompletion) - self.completer.activated.connect(self._insert_auto_complete) - - 
self.textChanged.connect(self.sanitize_input) - self.highlighter = UnknownWordHighlighter(self.document(), set()) - self.cursorPositionChanged.connect(self._set_items) - - def sanitize_input(self): - self._debounce_timer.stop() - text = self.toPlainText() - clean_text = AB_metadata.search_engine.ONE_SPACE_PATTERN.sub(" ", text) - - if clean_text != text: - cursor = self.textCursor() - position = cursor.position() - self.blockSignals(True) - self.clear() - self.insertPlainText(clean_text) - self.blockSignals(False) - cursor.setPosition(min(position, len(clean_text))) - self.setTextCursor(cursor) - - known_words = set() - for identifier in AB_metadata.search_engine.database_id_manager(self.database_name): - known_words.update(AB_metadata.search_engine.identifier_to_word[identifier].keys()) - self.highlighter.known_words = known_words - - if len(text) == 0: - self.popup.close() - self._set_debounce() - - def _insert_auto_complete(self, completion): - cursor = self.textCursor() - position = cursor.position() - text = self.toPlainText() - - start = position - while start > 0 and text[start - 1] != " ": - start -= 1 - new_position = start + len(completion) + 1 - - # select the word under the cursor - cursor.select(QTextCursor.WordUnderCursor) - # replace it with the completion - cursor.insertText(completion + " ") - # set the updated cursor to end of inserted word + space - cursor.setPosition(min(new_position, len(text[:start] + completion) + 1)) - self.setTextCursor(cursor) - - self.popup.close() - self.auto_complete_word = "" - self.model.setStringList([]) - - def _set_items(self): - text = self.toPlainText() - if text.startswith("="): - self.model.setStringList([]) - self.auto_complete_word = "" - self.popup.close() - return - - # find the start and end of the word under the cursor - cursor_pos = self.textCursor().position() - start = cursor_pos - while start > 0 and text[start - 1] != " ": - start -= 1 - end = cursor_pos - while end < len(text) and text[end] != " ": - end += 1 - current_word = text[start:end] - if not current_word: - self.model.setStringList([]) - self.popup.close() - return - if self.auto_complete_word == current_word: - # avoid unnecessary auto_complete calls if the current word didnt change - return - self.auto_complete_word = current_word - - context = set((text[:start] + text[end:]).split(" ")) - # get suggestions for the current word - alternatives = AB_metadata.auto_complete(current_word, context=context, database=self.database_name) - alternatives = alternatives[:6] # at most 6, though we should get ~3 usually - # replace the current word with each alternative - items = [] - for alt in alternatives: - new_text = text[:start] + alt + text[end:] - # items.append(new_text) - items.append(alt) - print(cursor_pos, text, items) - if len(items) == 0: - self.popup.close() - return - - self.model.setStringList(items) - # set correct height now that we have data - max_height = max( - 20, - self.popup.sizeHintForRow(0) * 3 + 2 * self.popup.frameWidth() - ) - self.popup.setMaximumHeight(max_height) - self.completer.complete() - - def keyPressEvent(self, event): - key = event.key() - - if key in (Qt.Key_Enter, Qt.Key_Return, Qt.Key_Tab): - # insert an autocomplete item - # capture enter/return/tab key - index = self.popup.currentIndex() - completion_text = index.data(Qt.DisplayRole) - self.completer.activated.emit(completion_text) - return - elif key in (Qt.Key_Space,): - self.popup.close() - - super().keyPressEvent(event) - - # trigger on text input keys - if event.text() or key 
in (Qt.LeftArrow, Qt.RightArrow): # filters out non-text keys like arrows, shift, etc. - self._set_items() diff --git a/activity_browser/ui/widgets/text_edit.py b/activity_browser/ui/widgets/text_edit.py new file mode 100644 index 000000000..aff4344ae --- /dev/null +++ b/activity_browser/ui/widgets/text_edit.py @@ -0,0 +1,251 @@ +from qtpy import QtWidgets +from qtpy.QtCore import QTimer, Signal, SignalInstance, QStringListModel, Qt +from qtpy.QtGui import QSyntaxHighlighter, QTextCharFormat, QTextDocument, QFont +from qtpy.QtWidgets import QCompleter, QStyledItemDelegate, QStyle + +from activity_browser.bwutils import AB_metadata + + +class UnknownWordHighlighter(QSyntaxHighlighter): + def __init__(self, parent: QTextDocument, known_words: set): + super().__init__(parent) + self.known_words = known_words + + # define the format for unknown words + self.unknown_format = QTextCharFormat() + self.unknown_format.setUnderlineStyle(QTextCharFormat.SpellCheckUnderline) + self.unknown_format.setUnderlineColor(Qt.red) + + def highlightBlock(self, text: str): + if text.startswith("="): + return + words = text.split() + index = 0 + for word in words: + word_len = len(word) + if word and word not in self.known_words: + self.setFormat(index, word_len, self.unknown_format) + index += word_len + 1 # +1 for the space + + +class AutoCompleteDelegate(QStyledItemDelegate): + def __init__(self, parent=None, get_bold_word_func=None): + super().__init__(parent) + self.get_bold_word_func = get_bold_word_func + + def paint(self, painter, option, index): + text = index.data(Qt.DisplayRole) + bold_words = self.get_bold_word_func() + bold_words = {word.lower() for word in bold_words} + + painter.save() + + # Draw selection background if selected + if option.state & QStyle.State_Selected: + painter.fillRect(option.rect, option.palette.highlight()) + painter.setPen(option.palette.highlightedText().color()) + else: + painter.setPen(option.palette.text().color()) + + # Split text into words and draw each with appropriate font + words = text.split(" ") + x = option.rect.x() + y = option.rect.y() + spacing = 4 # space between words + font = option.font + metrics = painter.fontMetrics() + + for word in words: + word_font = QFont(font) + if word.lower() in bold_words: + word_font.setBold(True) + painter.setFont(word_font) + + word_width = metrics.horizontalAdvance(word) + painter.drawText(x, y + metrics.ascent() + (option.rect.height() - metrics.height()) // 2, word) + x += word_width + spacing + painter.restore() + + +class ABTextEdit(QtWidgets.QTextEdit): + textChangedDebounce: SignalInstance = Signal(str) + _debounce_ms = 250 + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + self._debounce_timer = QTimer(self, singleShot=True) + + self.textChanged.connect(self._set_debounce) + self._debounce_timer.timeout.connect(self._emit_debounce) + + def _set_debounce(self): + self._debounce_timer.setInterval(self._debounce_ms) + self._debounce_timer.start() + + def _emit_debounce(self): + self.textChangedDebounce.emit(self.toPlainText()) + + def debounce(self): + return self._debounce_ms + + def setDebounce(self, ms: int): + self._debounce_ms = ms + + +class ABAutoCompleTextEdit(ABTextEdit): + def __init__(self, parent=None, highlight_unknown=False): + super().__init__(parent=parent) + self.auto_complete_word = "" + self.auto_complete_suggestions = [] + + # autocompleter settings + self.model = QStringListModel() + self.completer = QCompleter(self.model) + self.completer.setWidget(self) + self.popup = 
self.completer.popup() + # set custom delegate to bold the current word + delegate = AutoCompleteDelegate(self.popup, get_bold_word_func=lambda: self.auto_complete_suggestions) + self.popup.setItemDelegate(delegate) + self.popup.setHorizontalScrollBarPolicy(Qt.ScrollBarAsNeeded) + self.completer.setPopup(self.popup) + self.completer.setCompletionMode(QCompleter.UnfilteredPopupCompletion) # allow all items in popup list + self.completer.activated.connect(self._insert_auto_complete) + + self.textChanged.connect(self._sanitize_input) + if highlight_unknown: + self.highlighter = UnknownWordHighlighter(self.document(), set()) + self.cursorPositionChanged.connect(self._set_autocomplete_items) + + def keyPressEvent(self, event): + key = event.key() + + if key in (Qt.Key_Enter, Qt.Key_Return, Qt.Key_Tab): + # insert an autocomplete item + # capture enter/return/tab key + index = self.popup.currentIndex() + completion_text = index.data(Qt.DisplayRole) + self.completer.activated.emit(completion_text) + return + elif key in (Qt.Key_Space,): + self.popup.close() + + super().keyPressEvent(event) + + # trigger on text input keys + if event.text() or key in (Qt.LeftArrow, Qt.RightArrow): # filters out non-text keys except l/r arrows + self._set_autocomplete_items() + + def _sanitize_input(self): + raise NotImplementedError + + def _set_autocomplete_items(self): + raise NotImplementedError + + def _insert_auto_complete(self, completion): + cursor = self.textCursor() + position = cursor.position() + completion = completion + " " # add space to end of new text + + # find where to put cursor back + new_position = position + while new_position < len(completion) and completion[new_position] != " ": + new_position += 1 + new_position += 1 # add one char for space + + # set new text from completion + self.blockSignals(True) + self.clear() + self.setText(completion) + # set the cursor location + cursor.setPosition(min(new_position, len(completion))) + self.setTextCursor(cursor) + self.blockSignals(False) + + # house keeping + self._emit_debounce() + self.popup.close() + self.auto_complete_word = "" + self.model.setStringList([]) + + +class MetaDataAutoCompleteTextEdit(ABAutoCompleTextEdit): + """TextEdit with MetaDataStore completer attached.""" + def __init__(self, parent=None): + super().__init__(parent=parent, highlight_unknown=True) + self.database_name = "" + + def _sanitize_input(self): + self._debounce_timer.stop() + text = self.toPlainText() + clean_text = AB_metadata.search_engine.ONE_SPACE_PATTERN.sub(" ", text) + + if clean_text != text: + cursor = self.textCursor() + position = cursor.position() + self.blockSignals(True) + self.clear() + self.insertPlainText(clean_text) + self.blockSignals(False) + cursor.setPosition(min(position, len(clean_text))) + self.setTextCursor(cursor) + + known_words = set() + for identifier in AB_metadata.search_engine.database_id_manager(self.database_name): + known_words.update(AB_metadata.search_engine.identifier_to_word[identifier].keys()) + self.highlighter.known_words = known_words + + if len(text) == 0: + self.popup.close() + self._set_debounce() + + def _set_autocomplete_items(self): + text = self.toPlainText() + if text.startswith("="): + self.model.setStringList([]) + self.auto_complete_word = "" + self.popup.close() + return + + # find the start and end of the word under the cursor + cursor = self.textCursor() + position = cursor.position() + start = position + while start > 0 and text[start - 1] != " ": + start -= 1 + end = position + while end < len(text) and 
text[end] != " ": + end += 1 + current_word = text[start:end] + if not current_word: + self.model.setStringList([]) + self.popup.close() + self.auto_complete_word = "" + return + if self.auto_complete_word == current_word: + # avoid unnecessary auto_complete calls if the current word didnt change + return + self.auto_complete_word = current_word + + context = set((text[:start] + text[end:]).split(" ")) + # get suggestions for the current word + suggestions = AB_metadata.auto_complete(current_word, context=context, database=self.database_name) + suggestions = suggestions[:6] # at most 6, though we should get ~3 usually + self.auto_complete_suggestions = suggestions # set for bolding of autocomplete suggestions + # replace the current word with each alternative + items = [] + for alt in suggestions: + new_text = text[:start] + alt + text[end:] + items.append(new_text) + if len(items) == 0: + self.popup.close() + return + + self.model.setStringList(items) + # set correct height now that we have data + max_height = max( + 20, + self.popup.sizeHintForRow(0) * 3 + 2 * self.popup.frameWidth() + ) + self.popup.setMaximumHeight(max_height) + self.completer.complete() From 90583c668e5fcbdcfb1e357cf6ae13a5673352ac Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Fri, 12 Sep 2025 17:02:11 +0200 Subject: [PATCH 44/47] Refactor textedit to proper location --- activity_browser/ui/widgets/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/activity_browser/ui/widgets/__init__.py b/activity_browser/ui/widgets/__init__.py index 333811439..89d2c30ca 100644 --- a/activity_browser/ui/widgets/__init__.py +++ b/activity_browser/ui/widgets/__init__.py @@ -1,8 +1,8 @@ from .abstract_pane import ABAbstractPane from .comparison_switch import SwitchComboBox from .cutoff_menu import CutoffMenu -from .line_edit import (ABLineEdit, SignalledComboEdit, SignalledLineEdit, - SignalledPlainTextEdit, MetaDataAutoCompleteLineEdit) +from .line_edit import ABLineEdit, SignalledComboEdit, SignalledLineEdit, SignalledPlainTextEdit +from .text_edit import MetaDataAutoCompleteTextEdit from .treeview import ABTreeView from .item_model import ABItemModel from .item import ABAbstractItem, ABBranchItem, ABDataItem From fecbcf20cb8f4c6bf5f2c267b9fdb20b2ef68ab6 Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Tue, 16 Sep 2025 11:34:33 +0200 Subject: [PATCH 45/47] Implement search caching for faster results --- .../bwutils/searchengine/metadata_search.py | 143 +++++++++++++++--- 1 file changed, 123 insertions(+), 20 deletions(-) diff --git a/activity_browser/bwutils/searchengine/metadata_search.py b/activity_browser/bwutils/searchengine/metadata_search.py index 374ca56e0..1814a3e8a 100644 --- a/activity_browser/bwutils/searchengine/metadata_search.py +++ b/activity_browser/bwutils/searchengine/metadata_search.py @@ -12,17 +12,22 @@ class MetaDataSearchEngine(SearchEngine): + + # caching for faster operation def database_id_manager(self, database): if not hasattr(self, "all_database_ids"): self.all_database_ids = {} if database_ids := self.all_database_ids.get(database): self.database_ids = database_ids + self.current_database = database elif database is not None: self.database_ids = set(self.df[self.df["database"] == database].index.to_list()) self.all_database_ids[database] = self.database_ids + self.current_database = database else: self.database_ids = None + self.current_database = "_@@NO_DB_" return self.database_ids def reset_database_id_manager(self): @@ -31,10 +36,54 @@ def reset_database_id_manager(self): 
if hasattr(self, "database_ids"): del self.database_ids - def add_identifier(self, data: pd.DataFrame) -> None: - super().add_identifier(data) + def database_word_manager(self, database): + if not hasattr(self, "all_database_words"): + self.all_database_words = {} + + if database_words := self.all_database_words.get(database): + self.database_words = database_words + elif database is not None: + ids = self.database_id_manager(database) + self.database_words = self.reverse_dict_many_to_one({_id: self.identifier_to_word[_id] for _id in ids}) + self.all_database_words[database] = self.database_words + else: + self.database_words = None + return self.database_words + + def reset_database_word_manager(self, database): + if hasattr(self, "all_database_words") and self.all_database_words.get(database): + del self.all_database_words[database] + if hasattr(self, "database_words"): + del self.database_words + + def database_search_cache(self, database, query, result = None): + if not hasattr(self, "search_cache"): + self.search_cache = {} + + if result: + if self.search_cache.get(database): + self.search_cache[database][query] = result + else: + self.search_cache[database] = {query: result} + return + if db_cache := self.search_cache.get(database): + if cached_result := db_cache.get(query): + return cached_result + return + + def reset_search_cache(self, database): + if hasattr(self, "search_cache") and self.search_cache.get(database): + del self.search_cache[database] + + def reset_all_caches(self, databases): self.reset_database_id_manager() + for database in databases: + self.reset_database_word_manager(database) + self.reset_search_cache(database) + def add_identifier(self, data: pd.DataFrame) -> None: + super().add_identifier(data) + self.reset_all_caches(data["database"].unique()) def remove_identifiers(self, identifiers, logging=True) -> None: t = time() @@ -42,6 +91,7 @@ def remove_identifiers(self, identifiers, logging=True) -> None: identifiers = set(identifiers) current_identifiers = set(self.df.index.to_list()) identifiers = identifiers | current_identifiers # only remove identifiers currently in the data + databases = self.df.loc[identifiers, ["databases"]].unique() # extract databases for cache cleaning if len(identifiers) == 0: return @@ -51,11 +101,11 @@ def remove_identifiers(self, identifiers, logging=True) -> None: if logging: log.debug(f"Search index updated in {time() - t:.2f} seconds " f"for {len(identifiers)} removed items ({len(self.df)} items ({self.size_of_index()}) currently).") - self.reset_database_id_manager() + self.reset_all_caches(databases) def change_identifier(self, identifier, data: pd.DataFrame) -> None: super().change_identifier(identifier, data) - self.reset_database_id_manager() + self.reset_all_caches(data["database"].unique()) def auto_complete(self, word: str, context: Optional[set] = set(), database: Optional[str] = None) -> list: """Based on spellchecker, make more useful for autocompletions @@ -188,6 +238,53 @@ def find_q_gram_matches(self, q_grams: set, return_all: bool = False) -> pd.Data return matches.iloc[:min(len(matches), 2500), :] # return at most this many results + def search_size_1(self, queries: list, original_words: set, orig_word_weight=5, exact_word_weight=1) -> dict: + """Return a dict of {query_word: Counter(identifier)}. 
+ + queries: a list of length-1 tuples/lists, each containing a searched word or a 'spell checked' similar word + original_words: the set of words actually searched for (not including spell-checked alternatives) + + orig_word_weight: additional weight to add to original words + exact_word_weight: additional weight to add to exact word matches (as opposed to substring matches) + + First, we find all matching words, creating a dict with the words in 'queries' as keys and the words matching each query word as a list of values. + Next, we convert this to identifiers and add weights: + the weight is increased by 'orig_word_weight' for original words and by 'exact_word_weight' for exact matches. + """ + matches = {} + t2 = time() + # add each word in search index if query_word in word + for word in self.database_words.keys(): + for query in queries: + # query is list/tuple of len 1 + query_word = query[0] # only use the word + if query_word in word: + words = matches.get(query_word, []) + words.extend([word]) + matches[query_word] = words + + # now convert matched words to matched identifiers + matched_identifiers = {} + for word, matching_words in matches.items(): + if result := self.database_search_cache(self.current_database, word): + matched_identifiers[word] = result + continue + id_counter = matched_identifiers.get(word, Counter()) + for matched_word in matching_words: + weight = self.base_weight + + # add the word n times, where n is the weight, original search word is weighted higher than alternatives + if matched_word in original_words: + weight += orig_word_weight # increase weight for original word + if matched_word == word: + weight += exact_word_weight # increase weight for exact matching word + + id_counter = self.weigh_identifiers(self.database_words[matched_word], weight, id_counter) + matched_identifiers[word] = id_counter + self.database_search_cache(self.current_database, word, matched_identifiers[word]) + + return matched_identifiers + def fuzzy_search(self, text: str, database: Optional[str] = None, return_counter: bool = False, logging: bool = True) -> list: """Overwritten for extra database specific reduction of results.
""" @@ -200,6 +297,7 @@ def fuzzy_search(self, text: str, database: Optional[str] = None, return_counter # DATABASE SPECIFIC get the set of ids that is in this database self.database_id_manager(database) + self.database_word_manager(database) queries = self.build_queries(text) @@ -279,17 +377,21 @@ def fuzzy_search(self, text: str, database: Optional[str] = None, return_counter # now search for all permutations of this query combined with a space query_df = search_df[search_df[self.identifier_name].isin(query_identifiers)] for query_perm in permutations(query): - mask = self.filter_dataframe(query_df, " ".join(query_perm), search_columns=["query_col"]) - new_df = query_df.loc[mask].reset_index(drop=True) - if len(new_df) == 0: - # there is no match for this permutation of words, skip - continue - new_id_list = new_df[self.identifier_name] - - new_ids = Counter() - for new_id in new_id_list: - new_ids[new_id] = query_identifiers[new_id] - + query_perm_str = " ".join(query_perm) + if result := self.database_search_cache(self.current_database, query_perm_str): + new_ids = result + else: + mask = self.filter_dataframe(query_df, query_perm_str, search_columns=["query_col"]) + new_df = query_df.loc[mask].reset_index(drop=True) + if len(new_df) == 0: + # there is no match for this permutation of words, skip + continue + new_id_list = new_df[self.identifier_name] + + new_ids = Counter() + for new_id in new_id_list: + new_ids[new_id] = query_identifiers[new_id] + self.database_search_cache(self.current_database, query_perm_str, new_ids) # we weigh a combination of words that is next also to each other even higher than just the words separately query_to_identifier[query_name] = self.weigh_identifiers(new_ids, weight, query_to_identifier[query_name]) @@ -298,14 +400,15 @@ def fuzzy_search(self, text: str, database: Optional[str] = None, return_counter for identifiers in query_to_identifier.values(): all_identifiers += identifiers + if return_counter: + return_this = all_identifiers + else: + # now sort on highest weights and make list type + return_this = [identifier[0] for identifier in all_identifiers.most_common()] if logging: log.debug( f"Found {len(all_identifiers)} search results for '{text}' in {len(self.df)} items in {time() - t:.2f} seconds") - if return_counter: - return all_identifiers - # now sort on highest weights and make list type - sorted_identifiers = [identifier[0] for identifier in all_identifiers.most_common()] - return sorted_identifiers + return return_this def search(self, text, database: Optional[str] = None) -> list: """Search the dataframe on this text, return a sorted list of identifiers.""" From 9734ad2c467b549b87caeb1e23b4e768c8750e66 Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Tue, 16 Sep 2025 11:55:52 +0200 Subject: [PATCH 46/47] bold only current word, not all search suggested words --- activity_browser/ui/widgets/text_edit.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/activity_browser/ui/widgets/text_edit.py b/activity_browser/ui/widgets/text_edit.py index aff4344ae..9daf4fabe 100644 --- a/activity_browser/ui/widgets/text_edit.py +++ b/activity_browser/ui/widgets/text_edit.py @@ -29,14 +29,12 @@ def highlightBlock(self, text: str): class AutoCompleteDelegate(QStyledItemDelegate): - def __init__(self, parent=None, get_bold_word_func=None): + def __init__(self, parent=None): super().__init__(parent) - self.get_bold_word_func = get_bold_word_func + self.current_word_index = -1 def paint(self, painter, option, index): text = 
index.data(Qt.DisplayRole) - bold_words = self.get_bold_word_func() - bold_words = {word.lower() for word in bold_words} painter.save() @@ -55,9 +53,9 @@ def paint(self, painter, option, index): font = option.font metrics = painter.fontMetrics() - for word in words: + for i, word in enumerate(words): word_font = QFont(font) - if word.lower() in bold_words: + if i+1 == self.current_word_index: word_font.setBold(True) painter.setFont(word_font) @@ -97,16 +95,14 @@ class ABAutoCompleTextEdit(ABTextEdit): def __init__(self, parent=None, highlight_unknown=False): super().__init__(parent=parent) self.auto_complete_word = "" - self.auto_complete_suggestions = [] # autocompleter settings self.model = QStringListModel() self.completer = QCompleter(self.model) self.completer.setWidget(self) self.popup = self.completer.popup() - # set custom delegate to bold the current word - delegate = AutoCompleteDelegate(self.popup, get_bold_word_func=lambda: self.auto_complete_suggestions) - self.popup.setItemDelegate(delegate) + self.delegate = AutoCompleteDelegate(self.popup) # set custom delegate to bold the current word + self.popup.setItemDelegate(self.delegate) self.popup.setHorizontalScrollBarPolicy(Qt.ScrollBarAsNeeded) self.completer.setPopup(self.popup) self.completer.setCompletionMode(QCompleter.UnfilteredPopupCompletion) # allow all items in popup list @@ -228,10 +224,10 @@ def _set_autocomplete_items(self): self.auto_complete_word = current_word context = set((text[:start] + text[end:]).split(" ")) + self.delegate.current_word_index = len(text[:start].split(" ")) # current word index for bolding # get suggestions for the current word suggestions = AB_metadata.auto_complete(current_word, context=context, database=self.database_name) suggestions = suggestions[:6] # at most 6, though we should get ~3 usually - self.auto_complete_suggestions = suggestions # set for bolding of autocomplete suggestions # replace the current word with each alternative items = [] for alt in suggestions: From e342f2247f82f0980ea912b5ae604f9ac59ea29b Mon Sep 17 00:00:00 2001 From: marc-vdm Date: Tue, 16 Sep 2025 12:51:42 +0200 Subject: [PATCH 47/47] enable dealing with empty metadata in tests --- activity_browser/bwutils/searchengine/base.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/activity_browser/bwutils/searchengine/base.py b/activity_browser/bwutils/searchengine/base.py index 5a7752e3a..5b9127985 100644 --- a/activity_browser/bwutils/searchengine/base.py +++ b/activity_browser/bwutils/searchengine/base.py @@ -114,6 +114,9 @@ def update_dict(update_me: dict, new: dict) -> dict: update_me = update_me | new_data return update_me + if len(update_df) == 0: + return + t = time() size_old = len(self.df) # identifier to word and df
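
For reference, the caching pattern that PATCH 45 introduces in MetaDataSearchEngine can be reduced to the small, self-contained sketch below: database_search_cache doubles as getter and setter keyed by database and query, and reset_all_caches drops the per-database entries whenever identifiers are added, removed or changed, so stale results are never served. This sketch is illustrative only; the class, method and database names here are simplified and hypothetical, and it is not wired to the search index itself.

from collections import Counter


class QueryCache:
    # per-database cache of {query: Counter(identifier -> weight)} results
    def __init__(self):
        self._cache = {}  # {database: {query: Counter}}

    def query_cache(self, database, query, result=None):
        # setter: a truthy result is stored for this database/query pair
        # (mirrors the patch: empty/falsy results are treated as misses)
        if result:
            self._cache.setdefault(database, {})[query] = result
            return None
        # getter: return the cached result, or None on a cache miss
        return self._cache.get(database, {}).get(query)

    def reset(self, database):
        # call whenever identifiers in this database change
        self._cache.pop(database, None)


cache = QueryCache()
cache.query_cache("db_a", "steel", Counter({101: 12, 205: 10}))          # store
assert cache.query_cache("db_a", "steel") == Counter({101: 12, 205: 10})  # hit
cache.reset("db_a")                                                       # invalidate
assert cache.query_cache("db_a", "steel") is None                         # miss again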