Changes from all commits (53 commits)
fa68328
search class
marc-vdm Jul 21, 2025
40bf407
Include correct init file as well
marc-vdm Jul 22, 2025
413ac1a
Search tests, minor corrections, better documentation
marc-vdm Jul 23, 2025
556908c
Merge branch 'beta' into major_search
marc-vdm Jul 23, 2025
d01387f
Improve search speed with many results.
marc-vdm Jul 24, 2025
295995e
Add basic logging to SearchEngine
marc-vdm Jul 24, 2025
30958c7
Merge remote-tracking branch 'upstream/major' into major_search
marc-vdm Jul 24, 2025
79754ca
.
marc-vdm Jul 24, 2025
6bd39f7
Base implementation of metadata specific class
marc-vdm Jul 28, 2025
91a3328
minor changes to searchengine
marc-vdm Aug 18, 2025
e4c2005
Merge branch 'major' into major_search
marc-vdm Sep 2, 2025
8ec0cf4
- Solve bug in OSA distance for early stopping with long similar strings
marc-vdm Sep 2, 2025
e2bb1cf
update add/change identifier (and tests) to accept dataframes instead…
marc-vdm Sep 3, 2025
2d6ca0f
update add/change identifier (and tests) to accept dataframes instead…
marc-vdm Sep 3, 2025
04053ab
move searchengine.py to bwutils instead of subfolder
marc-vdm Sep 3, 2025
478ed5d
move searchengine.py to bwutils instead of subfolder
marc-vdm Sep 3, 2025
1c13007
move searchengine files
marc-vdm Sep 3, 2025
646b3be
move searchengine files
marc-vdm Sep 3, 2025
39af763
metadata and search size logging
marc-vdm Sep 3, 2025
fad8a06
- Faster results with large data and short queries
marc-vdm Sep 3, 2025
1aad95b
Base implementation of better search in ActivitiesProducts table
marc-vdm Sep 3, 2025
f14d43b
Merge branch 'major' into major_search
marc-vdm Sep 4, 2025
5b0a965
check all newly added items are unique
marc-vdm Sep 4, 2025
9ee3450
dont allow sorting of table when search engine in use
marc-vdm Sep 4, 2025
e92d298
resolve search bug with multiple typos not working
marc-vdm Sep 4, 2025
83ae162
First version of autocomplete
marc-vdm Sep 4, 2025
2b61e16
cache database identifiers for faster results + much faster autocomplete
marc-vdm Sep 4, 2025
64bbcd1
Implement proper autocomplete popup
marc-vdm Sep 4, 2025
e76f57c
suggestions for currently edited word instead of last word + better a…
marc-vdm Sep 5, 2025
eeed992
Improve text cleaning regex + autocomplete deals better with key hash…
marc-vdm Sep 5, 2025
bba71c7
better key hash sorting
marc-vdm Sep 5, 2025
8e73436
better autocomplete performance when many long qgram matches
marc-vdm Sep 5, 2025
2f07859
resolve bug with removing identifier from searchengine leading to bre…
marc-vdm Sep 5, 2025
6e5d1cb
add functionality for adding, changing and removing identifiers (exce…
marc-vdm Sep 5, 2025
0bd672c
add functionality for adding and removing full databases
marc-vdm Sep 6, 2025
4791c56
improve matching speed after metadata conversion to ProductModel
marc-vdm Sep 7, 2025
532cac2
make autocomplete suggestions aware of context of other words in quer…
marc-vdm Sep 7, 2025
42c3593
ProductModel suggestions now include literal matches better
marc-vdm Sep 7, 2025
4ec98fb
Update line-edit autocompleter base class
marc-vdm Sep 8, 2025
72e01d1
Add marking of unknown words to search
marc-vdm Sep 9, 2025
fbeb455
drop literal search results
marc-vdm Sep 9, 2025
59e8e18
marginal speed increases for initializing/updating for base class
marc-vdm Sep 9, 2025
e04c20e
marginal speed increases for initializing/updating for base class
marc-vdm Sep 9, 2025
1bedc53
Implement multiprocessing to increase speed for text cleaning during …
marc-vdm Sep 9, 2025
169a7cb
Fix bug with incorrect text length settings
marc-vdm Sep 9, 2025
fe10a1d
Merge branch 'major' into major_search
marc-vdm Sep 9, 2025
7efab02
Fix to allow testing of metadatastore
marc-vdm Sep 9, 2025
06747b8
Refactor textedit to proper location
marc-vdm Sep 12, 2025
90583c6
Refactor textedit to proper location
marc-vdm Sep 12, 2025
fecbcf2
Implement search caching for faster results
marc-vdm Sep 16, 2025
3c3bc6d
Merge branch 'major' into major_search
marc-vdm Sep 16, 2025
9734ad2
bold only current word, not all search suggested words
marc-vdm Sep 16, 2025
e342f22
enable dealing with empty metadata in tests
marc-vdm Sep 16, 2025
1 change: 1 addition & 0 deletions activity_browser/bwutils/__init__.py
@@ -13,6 +13,7 @@
from .montecarlo import MonteCarloLCA
from .multilca import MLCA, Contributions
from .pedigree import PedigreeMatrix
from .searchengine import SearchEngine, MetaDataSearchEngine
from .sensitivity_analysis import GlobalSensitivityAnalysis
from .superstructure import SuperstructureContributions, SuperstructureMLCA
from .uncertainty import (CFUncertaintyInterface, ExchangeUncertaintyInterface,
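With this re-export both search classes become importable from the bwutils package root. A minimal sketch (import paths as introduced by this PR):

    from activity_browser.bwutils import SearchEngine, MetaDataSearchEngine

    # the submodule path used inside metadata.py resolves to the same class
    from activity_browser.bwutils.searchengine import MetaDataSearchEngine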
82 changes: 78 additions & 4 deletions activity_browser/bwutils/metadata.py
@@ -2,13 +2,13 @@
import itertools
import sqlite3
import pickle
import sys
from time import time
from functools import lru_cache
from typing import Set
from typing import Set, Optional
from logging import getLogger

from playhouse.shortcuts import model_to_dict

import pandas as pd

from qtpy.QtCore import Qt, QObject, Signal, SignalInstance
@@ -17,6 +17,8 @@
from bw2data.errors import UnknownObject
from bw2data.backends import sqlite3_lci_db, ActivityDataset

from activity_browser.bwutils.searchengine import MetaDataSearchEngine

from activity_browser import signals


@@ -65,6 +67,12 @@ def __init__(self, parent=None):
self.moveToThread(application.thread())
self.connect_signals()

self.search_engine_whitelist = [
"id", "name", "synonyms", "unit", "key", "database", # generic
"CAS number", "categories", # biosphere specific
"product", "reference product", "classifications", "location", "properties" # activity specific
]

def connect_signals(self):
signals.project.changed.connect(self.sync)
signals.node.changed.connect(self.on_node_changed)
@@ -74,11 +82,32 @@ def connect_signals(self):

def on_node_deleted(self, ds):
try:
-            self.dataframe.drop(ds.key, inplace=True)
+            self.dataframe = self.dataframe.drop(ds.key)
+            self.remove_identifier_from_search_engine(ds)
self.synced.emit()
except KeyError:
pass

def remove_identifier_from_search_engine(self, ds):
if not hasattr(self, "search_engine"):
return
data = model_to_dict(ds)
identifier = data["id"]
if identifier in self.search_engine.database_id_manager(data["database"]):
self.search_engine.remove_identifier(identifier)
self.search_engine.reset_database_id_manager()

def remove_identifiers_from_search_engine(self, identifiers):
if not hasattr(self, "search_engine"):
return
t = time()
for identifier in identifiers:
self.search_engine.remove_identifier(identifier, logging=False)
self.search_engine.reset_database_id_manager()
log.debug(f"Search index updated in {time() - t:.2f} seconds "
f"for {len(identifiers)} removed items "
f"({len(self.search_engine.df)} items ({self.search_engine.size_of_index()}) currently).")

def on_node_changed(self, new, old):
data_raw = model_to_dict(new)
data = data_raw.pop("data")
@@ -96,13 +125,32 @@ def on_node_changed(self, new, old):
for col in [col for col in data.columns if col not in self.dataframe.columns]:
self.dataframe[col] = pd.NA
self.dataframe.loc[new.key] = data.loc[new.key]
self.change_identifier_in_search_engine(identifier=data.loc[new.key, "id"], data=data.loc[[new.key]])
elif self.dataframe.empty: # an activity has been added and the dataframe was empty
self.dataframe = data
self.add_identifier_to_search_engine(data)
else: # an activity has been added and needs to be concatenated to existing metadata
self.dataframe = pd.concat([self.dataframe, data], join="outer")
self.add_identifier_to_search_engine(data)

self.thread().eventDispatcher().awake.connect(self._emitSyncLater, Qt.ConnectionType.UniqueConnection)

def add_identifier_to_search_engine(self, data: pd.DataFrame):
if not hasattr(self, "search_engine"):
return
search_engine_cols = list(set(data.columns) & set(self.search_engine_whitelist)) # intersection becomes columns
data = data[search_engine_cols]
self.search_engine.add_identifier(data.copy())
self.search_engine.reset_database_id_manager()

def change_identifier_in_search_engine(self, identifier, data: pd.DataFrame):
if not hasattr(self, "search_engine"):
return
search_engine_cols = list(set(data.columns) & set(self.search_engine_whitelist)) # intersection becomes columns
data = data[search_engine_cols]
self.search_engine.change_identifier(identifier=identifier, data=data.copy())
self.search_engine.reset_database_id_manager()
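Both helpers above cut the incoming frame down to whitelisted columns with a set intersection. One subtlety: set intersection yields the columns in arbitrary order, so search_engine_cols does not necessarily follow the dataframe's column order; if the engine ever depends on a stable order, a comprehension over data.columns preserves it. An illustrative sketch (data hypothetical):

    import pandas as pd

    whitelist = ["id", "name", "unit", "database"]
    df = pd.DataFrame({"name": ["steel"], "id": [7], "comment": ["n/a"]})

    arbitrary = list(set(df.columns) & set(whitelist))        # order not guaranteed
    stable = [col for col in df.columns if col in whitelist]  # keeps frame order
    print(arbitrary, stable)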

@property
def databases(self):
return set(self.dataframe.get("database", []))
@@ -154,7 +202,10 @@ def sync_databases(self) -> None:

for db_name in [x for x in self.databases if x not in bd.databases]:
# deleted databases
remove_search_engine = self.dataframe[self.dataframe["database"] == db_name]["id"]
self.dataframe.drop(db_name, level=0, inplace=True)
if len(remove_search_engine) > 0:
self.remove_identifiers_from_search_engine(remove_search_engine)
sync = True

for db_name in [x for x in bd.databases if x not in self.databases]:
@@ -167,7 +218,7 @@ def sync_databases(self) -> None:
self.dataframe = data
else:
self.dataframe = pd.concat([self.dataframe, data], join="outer")

self.add_identifier_to_search_engine(data)
sync = True

if sync:
@@ -183,6 +234,7 @@ def _get_database(self, db_name: str) -> pd.DataFrame | None:

def sync(self) -> None:
"""Deletes metadata when the project is changed."""
t = time()
log.debug("Synchronizing MetaDataStore")

con = sqlite3.connect(sqlite3_lci_db._filepath)
@@ -191,6 +243,13 @@

self.dataframe = self._parse_df(node_df)

size_bytes = sys.getsizeof(self.dataframe)
if size_bytes < 1024 ** 3:
size = f"{size_bytes / (1024 ** 2):.1f} MB"
else:
size = f"{size_bytes / (1024 ** 3):.2f} GB"
log.debug(f"MetaDataStore Synchronized in {time() - t:.2f} seconds for {len(self.dataframe)} items ({size}))")
self.init_search() # init search index
self.synced.emit()
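The logged size comes from sys.getsizeof on the whole dataframe, which is only as good as pandas' __sizeof__; recent pandas delegates it to memory_usage(deep=True), but that is an assumption worth verifying against the pinned pandas version. A quick cross-check:

    import sys
    import pandas as pd

    df = pd.DataFrame({"name": ["steel production"] * 1_000})
    # if __sizeof__ delegates to deep memory_usage, these agree closely
    print(sys.getsizeof(df))
    print(df.memory_usage(deep=True).sum())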

def _parse_df(self, raw_df: pd.DataFrame) -> pd.DataFrame:
Expand Down Expand Up @@ -343,5 +402,20 @@ def _unpacker(self, classifications: list, system: str) -> list:
system_classifications.append(result) # result is either "" or the classification
return system_classifications

def init_search(self):
self.search_engine = MetaDataSearchEngine(self.dataframe, identifier_name="id", searchable_columns=self.search_engine_whitelist)

    def db_search(self, query: str, database: Optional[str] = None, return_counter: bool = False, logging: bool = True):
# we do fuzzy search as we re-index results (combining products and activities) for database_products table
# anyway, so including literal results quite literally is a waste of time at this point
return self.search_engine.fuzzy_search(query, database=database, return_counter=return_counter, logging=logging)

    def search(self, query: str):
return self.search_engine.search(query)

    def auto_complete(self, word: str, context: Optional[set] = None, database: Optional[str] = None):
word = self.search_engine.clean_text(word)
completions = self.search_engine.auto_complete(word, context=context, database=database)
return completions

AB_metadata = MetaDataStore()
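Taken together, the AB_metadata singleton now exposes a compact search API on top of the engine. A hypothetical session (queries and the database name are made up; method signatures are as in this diff):

    from activity_browser.bwutils.metadata import AB_metadata

    # search over the whitelisted metadata columns
    results = AB_metadata.search("steel production")

    # database-scoped fuzzy search, as used by the database_products table
    hits = AB_metadata.db_search("stele prodction", database="my_db")

    # completions for the word currently being edited, given sibling words
    words = AB_metadata.auto_complete("prod", context={"steel"})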
2 changes: 2 additions & 0 deletions activity_browser/bwutils/searchengine/__init__.py
@@ -0,0 +1,2 @@
from .base import SearchEngine
from .metadata_search import MetaDataSearchEngine