From cf770febfdc5ba624a491367019eb131394ba8ac Mon Sep 17 00:00:00 2001 From: bruntib Date: Wed, 21 Jan 2026 17:01:48 +0100 Subject: [PATCH] [db] Introduce a join table between SourceComponent and File Queries by source components are slow, because it contains a WHERE clause that performs string pattern matching on file paths. As a solution SourceComponentFile join table is introduced that stores which files match to a given source component. --- .../codechecker_server/api/mass_store_run.py | 51 ++++++- .../codechecker_server/api/report_server.py | 128 ++++++++++-------- .../database/run_db_model.py | 17 +++ ...c219_add_sourcecomponentfile_join_table.py | 64 +++++++++ .../report_viewer_api/test_report_counting.py | 24 ---- 5 files changed, 203 insertions(+), 81 deletions(-) create mode 100644 web/server/codechecker_server/migrations/report/versions/198654dac219_add_sourcecomponentfile_join_table.py diff --git a/web/server/codechecker_server/api/mass_store_run.py b/web/server/codechecker_server/api/mass_store_run.py index 6534ba0738..0c44436248 100644 --- a/web/server/codechecker_server/api/mass_store_run.py +++ b/web/server/codechecker_server/api/mass_store_run.py @@ -14,6 +14,7 @@ import base64 from collections import defaultdict from datetime import datetime, timedelta +import fnmatch from hashlib import sha256 import json import os @@ -51,7 +52,8 @@ ExtendedReportData, \ File, FileContent, \ Report as DBReport, ReportAnnotations, ReviewStatus as ReviewStatusRule, \ - Run, RunLock as DBRunLock, RunHistory + Run, RunLock as DBRunLock, RunHistory, \ + SourceComponent, SourceComponentFile from ..metadata import checker_is_unavailable, MetadataInfoParser from .report_annotations import report_annotation_types @@ -272,6 +274,50 @@ def get_file_content(file_path: str) -> bytes: return f.read() +def assign_file_to_source_components(session, file_id, filepath): + """ + Checks all Source Components and links the file if it matches. + """ + components = session.query(SourceComponent).all() + + associations = [] + + from .report_server import get_component_values + + for component in components: + include, skip = get_component_values(component) + + # If no patterns are defined, the component matches nothing. + if not skip and not include: + continue + + is_included = False + if include: + for pattern in include: + if fnmatch.fnmatch(filepath, pattern): + is_included = True + break + else: + # If only skip is defined, it matches everything except skips. + is_included = True + + is_skipped = False + if skip: + for pattern in skip: + if fnmatch.fnmatch(filepath, pattern): + is_skipped = True + break + + if is_included and not is_skipped: + associations.append({ + 'source_component_name': component.name, + 'file_id': file_id + }) + + if associations: + session.bulk_insert_mappings(SourceComponentFile, associations) + + def add_file_record( session: DBSession, file_path: str, @@ -320,11 +366,14 @@ def add_file_record( content_hash=content_hash).on_conflict_do_nothing( index_elements=['filepath', 'content_hash']) file_id = session.execute(insert_stmt).inserted_primary_key[0] + assign_file_to_source_components(session, file_id, file_path) session.commit() return file_id file_record = File(file_path, content_hash, None, None) session.add(file_record) + session.flush() + assign_file_to_source_components(session, file_record.id, file_path) session.commit() except sqlalchemy.exc.IntegrityError as ex: LOG.error(ex) diff --git a/web/server/codechecker_server/api/report_server.py b/web/server/codechecker_server/api/report_server.py index 4cb5edc2a3..56720d4b98 100644 --- a/web/server/codechecker_server/api/report_server.py +++ b/web/server/codechecker_server/api/report_server.py @@ -69,7 +69,7 @@ File, FileContent, \ Report, ReportAnnotations, ReportAnalysisInfo, ReviewStatus, \ Run, RunHistory, RunHistoryAnalysisInfo, RunLock, \ - SourceComponent + SourceComponent, SourceComponentFile from .common import exc_to_thrift_reqfail from .thrift_enum_helper import detection_status_enum, \ @@ -146,41 +146,79 @@ def slugify(text): def get_component_values( - session: DBSession, - component_name: str + component: SourceComponent ) -> Tuple[List[str], List[str]]: """ - Get component values by component names and returns a tuple where the - first item contains a list path which should be skipped and the second - item contains a list of path which should be included. + Returns a tuple where the first item contains a list paths that should be + included and the second item contains a list of paths that should be + skipped. E.g.: +/a/b/x.cpp +/a/b/y.cpp -/a/b On the above component value this function will return the following: - (['/a/b'], ['/a/b/x.cpp', '/a/b/y.cpp']) + (['/a/b/x.cpp', '/a/b/y.cpp'], ['/a/b']) """ - components = session.query(SourceComponent) \ - .filter(SourceComponent.name.like(component_name)) \ - .all() - - skip = [] include = [] + skip = [] - for component in components: - values = component.value.decode('utf-8').split('\n') - for value in values: - value = value.strip() - if not value: - continue + values = component.value.decode('utf-8').split('\n') + for value in values: + value = value.strip() + if not value: + continue + + v = value[1:] + if value[0] == '+': + include.append(v) + elif value[0] == '-': + skip.append(v) + + return include, skip + + +def update_source_component_files( + session, + component: Optional[SourceComponent] = None +): + """ + Refreshes the SourceComponentFile table for a specific source component. + If `component` is None, then all source components are updated. + """ + if component is None: + all_components = session.query(SourceComponent) + else: + all_components = [component] + + # 1. Delete existing associations for this component + session.query(SourceComponentFile) \ + .filter(SourceComponentFile.source_component_name.in_( + map(lambda component: component.name, all_components))) \ + .delete(synchronize_session=False) + + for comp in all_components: + # 2. Re-calculate associations + include, skip = get_component_values(comp) + + file_ids_query = None + if skip and include: + include_q, skip_q = get_include_skip_queries(include, skip) + file_ids_query = include_q.except_(skip_q) + elif include: + include_q, _ = get_include_skip_queries(include, []) + file_ids_query = include_q + elif skip: + _, skip_q = get_include_skip_queries([], skip) + file_ids_query = select(File.id).where(File.id.notin_(skip_q)) - v = value[1:] - if value[0] == '+': - include.append(v) - elif value[0] == '-': - skip.append(v) + if file_ids_query is not None: + file_ids = session.execute(file_ids_query).fetchall() - return skip, include + if file_ids: + session.bulk_insert_mappings( + SourceComponentFile, + [{'source_component_name': comp.name, + 'file_id': fid[0]} for fid in file_ids]) def process_report_filter( @@ -506,18 +544,10 @@ def get_source_component_file_query( component_name: str ): """ Get filter query for a single source component. """ - skip, include = get_component_values(session, component_name) - - if skip and include: - include_q, skip_q = get_include_skip_queries(include, skip) - return File.id.in_(include_q.except_(skip_q)) - - if include: - return or_(*[File.filepath.like(conv(fp)) for fp in include]) - elif skip: - return and_(*[not_(File.filepath.like(conv(fp))) for fp in skip]) - - return None + return File.id.in_( + session.query(SourceComponentFile.file_id) + .filter(SourceComponentFile.source_component_name == component_name) + ) def get_reports_by_bugpath_filter_for_single_origin( @@ -635,28 +665,12 @@ def get_other_source_component_file_query(session): (Files NOT IN Component_1) AND (Files NOT IN Component_2) ... AND (Files NOT IN Component_N) """ - component_names = session.query(SourceComponent.name).all() - - # If there are no user defined source components we don't have to filter. - if not component_names: + # Check if there are any source components + if not session.query(SourceComponent).count(): return None - def get_query(component_name: str): - """ Get file filter query for auto generated Other component. """ - skip, include = get_component_values(session, component_name) - - if skip and include: - include_q, skip_q = get_include_skip_queries(include, skip) - return File.id.notin_(include_q.except_(skip_q)) - elif include: - return and_(*[File.filepath.notlike(conv(fp)) for fp in include]) - elif skip: - return or_(*[File.filepath.like(conv(fp)) for fp in skip]) - - return None - - queries = [get_query(n) for (n, ) in component_names] - return and_(*queries) + files_in_components = session.query(SourceComponentFile.file_id) + return File.id.notin_(files_in_components) def get_open_reports_date_filter_query(tbl=Report, date=RunHistory.time): @@ -3897,6 +3911,8 @@ def addSourceComponent(self, name, value, description): user) session.add(component) + update_source_component_files(session, component) + session.commit() return True diff --git a/web/server/codechecker_server/database/run_db_model.py b/web/server/codechecker_server/database/run_db_model.py index 27919bc47b..4d346c9513 100644 --- a/web/server/codechecker_server/database/run_db_model.py +++ b/web/server/codechecker_server/database/run_db_model.py @@ -585,6 +585,23 @@ def __init__(self, name, due_date=None, description=None, closed_at=None): self.closed_at = closed_at +class SourceComponentFile(Base): + __tablename__ = 'source_component_files' + + source_component_name = Column(String, + ForeignKey('source_components.name', + ondelete='CASCADE'), + primary_key=True) + file_id = Column(Integer, + ForeignKey('files.id', + ondelete='CASCADE'), + primary_key=True) + + def __init__(self, source_component_name, file_id): + self.source_component_name = source_component_name + self.file_id = file_id + + class CleanupPlanReportHash(Base): __tablename__ = 'cleanup_plan_report_hashes' diff --git a/web/server/codechecker_server/migrations/report/versions/198654dac219_add_sourcecomponentfile_join_table.py b/web/server/codechecker_server/migrations/report/versions/198654dac219_add_sourcecomponentfile_join_table.py new file mode 100644 index 0000000000..e8825f50d8 --- /dev/null +++ b/web/server/codechecker_server/migrations/report/versions/198654dac219_add_sourcecomponentfile_join_table.py @@ -0,0 +1,64 @@ +""" +Add SourceComponentFile join table + +Revision ID: 198654dac219 +Revises: c3dad71f8e6b +Create Date: 2026-01-14 15:43:27.225574 +""" + +from logging import getLogger + +from alembic import op +import sqlalchemy as sa +from sqlalchemy.orm import Session + +from codechecker_server.api.report_server import \ + update_source_component_files + + +# Revision identifiers, used by Alembic. +revision = '198654dac219' +down_revision = 'c3dad71f8e6b' +branch_labels = None +depends_on = None + + +def upgrade(): + LOG = getLogger("migration/report") + # ### commands auto generated by Alembic - please adjust! ### + op.create_table( + 'source_component_files', + sa.Column('source_component_name', sa.String(), nullable=False), + sa.Column('file_id', sa.Integer(), nullable=False), + sa.ForeignKeyConstraint( + ['file_id'], + ['files.id'], + name=op.f('fk_source_component_files_file_id_files'), + ondelete='CASCADE'), + sa.ForeignKeyConstraint( + ['source_component_name'], + ['source_components.name'], + name=op.f( + 'fk_source_component_files_source_component_name_' + 'source_components'), + ondelete='CASCADE'), + sa.PrimaryKeyConstraint( + 'source_component_name', + 'file_id', + name=op.f('pk_source_component_files')) + ) + + conn = op.get_bind() + + session = Session(bind=conn) + + update_source_component_files(session) + + # ### end Alembic commands ### + + +def downgrade(): + LOG = getLogger("migration/report") + # ### commands auto generated by Alembic - please adjust! ### + op.drop_table('source_component_files') + # ### end Alembic commands ### diff --git a/web/tests/functional/report_viewer_api/test_report_counting.py b/web/tests/functional/report_viewer_api/test_report_counting.py index e30e20a203..1170a052df 100644 --- a/web/tests/functional/report_viewer_api/test_report_counting.py +++ b/web/tests/functional/report_viewer_api/test_report_counting.py @@ -283,30 +283,6 @@ def test_run1_all_file(self): self.assertEqual(len(res), len(self.run1_files)) self.assertDictEqual(res, self.run1_files) - def test_filter_by_file_and_source_component(self): - """ - File and source component filter in getFileCounts(). - - Earlier this function resulted an SQL error due to an invalid SQL - statement (File table was ambiguously used, because it was joined - multiple times). - - On the other hand we test here that getFileCounts() returns the number - of reports in all files regardless the filter fields. The reason is - that it wouldn't be possible on the GUI to display the options of the - file path filter which doesn't contain a report (i.e. their endpoint is - not in that file). In the future it would be enough to ignore the - filter only if "anywhere on bugpath" option is used (TODO?). - """ - runid = self._runids[0] - run_filter = ReportFilter( - filepath="call*", - componentNames=["doesn't exist"]) - file_counts = self._cc_client.getFileCounts( - [runid], run_filter, None, None, 0) - - self.assertEqual(len(file_counts), len(self.run1_files)) - def test_run2_all_file(self): """ Get all the file counts for run2.