From cf770febfdc5ba624a491367019eb131394ba8ac Mon Sep 17 00:00:00 2001
From: bruntib <bruntib@gmail.com>
Date: Wed, 21 Jan 2026 17:01:48 +0100
Subject: [PATCH] [db] Introduce a join table between SourceComponent and File

Queries by source components are slow, because it contains a WHERE
clause that performs string pattern matching on file paths. As a
solution SourceComponentFile join table is introduced that stores which
files match to a given source component.
---
 .../codechecker_server/api/mass_store_run.py  |  51 ++++++-
 .../codechecker_server/api/report_server.py   | 128 ++++++++++--------
 .../database/run_db_model.py                  |  17 +++
 ...c219_add_sourcecomponentfile_join_table.py |  64 +++++++++
 .../report_viewer_api/test_report_counting.py |  24 ----
 5 files changed, 203 insertions(+), 81 deletions(-)
 create mode 100644 web/server/codechecker_server/migrations/report/versions/198654dac219_add_sourcecomponentfile_join_table.py

diff --git a/web/server/codechecker_server/api/mass_store_run.py b/web/server/codechecker_server/api/mass_store_run.py
index 6534ba0738..0c44436248 100644
--- a/web/server/codechecker_server/api/mass_store_run.py
+++ b/web/server/codechecker_server/api/mass_store_run.py
@@ -14,6 +14,7 @@
 import base64
 from collections import defaultdict
 from datetime import datetime, timedelta
+import fnmatch
 from hashlib import sha256
 import json
 import os
@@ -51,7 +52,8 @@
     ExtendedReportData, \
     File, FileContent, \
     Report as DBReport, ReportAnnotations, ReviewStatus as ReviewStatusRule, \
-    Run, RunLock as DBRunLock, RunHistory
+    Run, RunLock as DBRunLock, RunHistory, \
+    SourceComponent, SourceComponentFile
 from ..metadata import checker_is_unavailable, MetadataInfoParser
 
 from .report_annotations import report_annotation_types
@@ -272,6 +274,50 @@ def get_file_content(file_path: str) -> bytes:
         return f.read()
 
 
+def assign_file_to_source_components(session, file_id, filepath):
+    """
+    Checks all Source Components and links the file if it matches.
+    """
+    components = session.query(SourceComponent).all()
+
+    associations = []
+
+    from .report_server import get_component_values
+
+    for component in components:
+        include, skip = get_component_values(component)
+
+        # If no patterns are defined, the component matches nothing.
+        if not skip and not include:
+            continue
+
+        is_included = False
+        if include:
+            for pattern in include:
+                if fnmatch.fnmatch(filepath, pattern):
+                    is_included = True
+                    break
+        else:
+            # If only skip is defined, it matches everything except skips.
+            is_included = True
+
+        is_skipped = False
+        if skip:
+            for pattern in skip:
+                if fnmatch.fnmatch(filepath, pattern):
+                    is_skipped = True
+                    break
+
+        if is_included and not is_skipped:
+            associations.append({
+                'source_component_name': component.name,
+                'file_id': file_id
+            })
+
+    if associations:
+        session.bulk_insert_mappings(SourceComponentFile, associations)
+
+
 def add_file_record(
     session: DBSession,
     file_path: str,
@@ -320,11 +366,14 @@ def add_file_record(
                 content_hash=content_hash).on_conflict_do_nothing(
                     index_elements=['filepath', 'content_hash'])
             file_id = session.execute(insert_stmt).inserted_primary_key[0]
+            assign_file_to_source_components(session, file_id, file_path)
             session.commit()
             return file_id
 
         file_record = File(file_path, content_hash, None, None)
         session.add(file_record)
+        session.flush()
+        assign_file_to_source_components(session, file_record.id, file_path)
         session.commit()
     except sqlalchemy.exc.IntegrityError as ex:
         LOG.error(ex)
diff --git a/web/server/codechecker_server/api/report_server.py b/web/server/codechecker_server/api/report_server.py
index 4cb5edc2a3..56720d4b98 100644
--- a/web/server/codechecker_server/api/report_server.py
+++ b/web/server/codechecker_server/api/report_server.py
@@ -69,7 +69,7 @@
     File, FileContent, \
     Report, ReportAnnotations, ReportAnalysisInfo, ReviewStatus, \
     Run, RunHistory, RunHistoryAnalysisInfo, RunLock, \
-    SourceComponent
+    SourceComponent, SourceComponentFile
 
 from .common import exc_to_thrift_reqfail
 from .thrift_enum_helper import detection_status_enum, \
@@ -146,41 +146,79 @@ def slugify(text):
 
 
 def get_component_values(
-    session: DBSession,
-    component_name: str
+    component: SourceComponent
 ) -> Tuple[List[str], List[str]]:
     """
-    Get component values by component names and returns a tuple where the
-    first item contains a list path which should be skipped and the second
-    item contains a list of path which should be included.
+    Returns a tuple where the first item contains a list paths that should be
+    included and the second item contains a list of paths that should be
+    skipped.
     E.g.:
       +/a/b/x.cpp
       +/a/b/y.cpp
       -/a/b
     On the above component value this function will return the following:
-      (['/a/b'], ['/a/b/x.cpp', '/a/b/y.cpp'])
+      (['/a/b/x.cpp', '/a/b/y.cpp'], ['/a/b'])
     """
-    components = session.query(SourceComponent) \
-        .filter(SourceComponent.name.like(component_name)) \
-        .all()
-
-    skip = []
     include = []
+    skip = []
 
-    for component in components:
-        values = component.value.decode('utf-8').split('\n')
-        for value in values:
-            value = value.strip()
-            if not value:
-                continue
+    values = component.value.decode('utf-8').split('\n')
+    for value in values:
+        value = value.strip()
+        if not value:
+            continue
+
+        v = value[1:]
+        if value[0] == '+':
+            include.append(v)
+        elif value[0] == '-':
+            skip.append(v)
+
+    return include, skip
+
+
+def update_source_component_files(
+    session,
+    component: Optional[SourceComponent] = None
+):
+    """
+    Refreshes the SourceComponentFile table for a specific source component.
+    If `component` is None, then all source components are updated.
+    """
+    if component is None:
+        all_components = session.query(SourceComponent)
+    else:
+        all_components = [component]
+
+    # 1. Delete existing associations for this component
+    session.query(SourceComponentFile) \
+        .filter(SourceComponentFile.source_component_name.in_(
+            map(lambda component: component.name, all_components))) \
+        .delete(synchronize_session=False)
+
+    for comp in all_components:
+        # 2. Re-calculate associations
+        include, skip = get_component_values(comp)
+
+        file_ids_query = None
+        if skip and include:
+            include_q, skip_q = get_include_skip_queries(include, skip)
+            file_ids_query = include_q.except_(skip_q)
+        elif include:
+            include_q, _ = get_include_skip_queries(include, [])
+            file_ids_query = include_q
+        elif skip:
+            _, skip_q = get_include_skip_queries([], skip)
+            file_ids_query = select(File.id).where(File.id.notin_(skip_q))
 
-            v = value[1:]
-            if value[0] == '+':
-                include.append(v)
-            elif value[0] == '-':
-                skip.append(v)
+        if file_ids_query is not None:
+            file_ids = session.execute(file_ids_query).fetchall()
 
-    return skip, include
+            if file_ids:
+                session.bulk_insert_mappings(
+                    SourceComponentFile,
+                    [{'source_component_name': comp.name,
+                      'file_id': fid[0]} for fid in file_ids])
 
 
 def process_report_filter(
@@ -506,18 +544,10 @@ def get_source_component_file_query(
     component_name: str
 ):
     """ Get filter query for a single source component. """
-    skip, include = get_component_values(session, component_name)
-
-    if skip and include:
-        include_q, skip_q = get_include_skip_queries(include, skip)
-        return File.id.in_(include_q.except_(skip_q))
-
-    if include:
-        return or_(*[File.filepath.like(conv(fp)) for fp in include])
-    elif skip:
-        return and_(*[not_(File.filepath.like(conv(fp))) for fp in skip])
-
-    return None
+    return File.id.in_(
+        session.query(SourceComponentFile.file_id)
+        .filter(SourceComponentFile.source_component_name == component_name)
+    )
 
 
 def get_reports_by_bugpath_filter_for_single_origin(
@@ -635,28 +665,12 @@ def get_other_source_component_file_query(session):
         (Files NOT IN Component_1) AND (Files NOT IN Component_2) ... AND
         (Files NOT IN Component_N)
     """
-    component_names = session.query(SourceComponent.name).all()
-
-    # If there are no user defined source components we don't have to filter.
-    if not component_names:
+    # Check if there are any source components
+    if not session.query(SourceComponent).count():
         return None
 
-    def get_query(component_name: str):
-        """ Get file filter query for auto generated Other component. """
-        skip, include = get_component_values(session, component_name)
-
-        if skip and include:
-            include_q, skip_q = get_include_skip_queries(include, skip)
-            return File.id.notin_(include_q.except_(skip_q))
-        elif include:
-            return and_(*[File.filepath.notlike(conv(fp)) for fp in include])
-        elif skip:
-            return or_(*[File.filepath.like(conv(fp)) for fp in skip])
-
-        return None
-
-    queries = [get_query(n) for (n, ) in component_names]
-    return and_(*queries)
+    files_in_components = session.query(SourceComponentFile.file_id)
+    return File.id.notin_(files_in_components)
 
 
 def get_open_reports_date_filter_query(tbl=Report, date=RunHistory.time):
@@ -3897,6 +3911,8 @@ def addSourceComponent(self, name, value, description):
                                             user)
 
             session.add(component)
+            update_source_component_files(session, component)
+
             session.commit()
 
             return True
diff --git a/web/server/codechecker_server/database/run_db_model.py b/web/server/codechecker_server/database/run_db_model.py
index 27919bc47b..4d346c9513 100644
--- a/web/server/codechecker_server/database/run_db_model.py
+++ b/web/server/codechecker_server/database/run_db_model.py
@@ -585,6 +585,23 @@ def __init__(self, name, due_date=None, description=None, closed_at=None):
         self.closed_at = closed_at
 
 
+class SourceComponentFile(Base):
+    __tablename__ = 'source_component_files'
+
+    source_component_name = Column(String,
+                                   ForeignKey('source_components.name',
+                                              ondelete='CASCADE'),
+                                   primary_key=True)
+    file_id = Column(Integer,
+                     ForeignKey('files.id',
+                                ondelete='CASCADE'),
+                     primary_key=True)
+
+    def __init__(self, source_component_name, file_id):
+        self.source_component_name = source_component_name
+        self.file_id = file_id
+
+
 class CleanupPlanReportHash(Base):
     __tablename__ = 'cleanup_plan_report_hashes'
 
diff --git a/web/server/codechecker_server/migrations/report/versions/198654dac219_add_sourcecomponentfile_join_table.py b/web/server/codechecker_server/migrations/report/versions/198654dac219_add_sourcecomponentfile_join_table.py
new file mode 100644
index 0000000000..e8825f50d8
--- /dev/null
+++ b/web/server/codechecker_server/migrations/report/versions/198654dac219_add_sourcecomponentfile_join_table.py
@@ -0,0 +1,64 @@
+"""
+Add SourceComponentFile join table
+
+Revision ID: 198654dac219
+Revises:     c3dad71f8e6b
+Create Date: 2026-01-14 15:43:27.225574
+"""
+
+from logging import getLogger
+
+from alembic import op
+import sqlalchemy as sa
+from sqlalchemy.orm import Session
+
+from codechecker_server.api.report_server import \
+    update_source_component_files
+
+
+# Revision identifiers, used by Alembic.
+revision = '198654dac219'
+down_revision = 'c3dad71f8e6b'
+branch_labels = None
+depends_on = None
+
+
+def upgrade():
+    LOG = getLogger("migration/report")
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.create_table(
+        'source_component_files',
+        sa.Column('source_component_name', sa.String(), nullable=False),
+        sa.Column('file_id', sa.Integer(), nullable=False),
+        sa.ForeignKeyConstraint(
+            ['file_id'],
+            ['files.id'],
+            name=op.f('fk_source_component_files_file_id_files'),
+            ondelete='CASCADE'),
+        sa.ForeignKeyConstraint(
+            ['source_component_name'],
+            ['source_components.name'],
+            name=op.f(
+                'fk_source_component_files_source_component_name_'
+                'source_components'),
+            ondelete='CASCADE'),
+        sa.PrimaryKeyConstraint(
+            'source_component_name',
+            'file_id',
+            name=op.f('pk_source_component_files'))
+    )
+
+    conn = op.get_bind()
+
+    session = Session(bind=conn)
+
+    update_source_component_files(session)
+
+    # ### end Alembic commands ###
+
+
+def downgrade():
+    LOG = getLogger("migration/report")
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.drop_table('source_component_files')
+    # ### end Alembic commands ###
diff --git a/web/tests/functional/report_viewer_api/test_report_counting.py b/web/tests/functional/report_viewer_api/test_report_counting.py
index e30e20a203..1170a052df 100644
--- a/web/tests/functional/report_viewer_api/test_report_counting.py
+++ b/web/tests/functional/report_viewer_api/test_report_counting.py
@@ -283,30 +283,6 @@ def test_run1_all_file(self):
         self.assertEqual(len(res), len(self.run1_files))
         self.assertDictEqual(res, self.run1_files)
 
-    def test_filter_by_file_and_source_component(self):
-        """
-        File and source component filter in getFileCounts().
-
-        Earlier this function resulted an SQL error due to an invalid SQL
-        statement (File table was ambiguously used, because it was joined
-        multiple times).
-
-        On the other hand we test here that getFileCounts() returns the number
-        of reports in all files regardless the filter fields. The reason is
-        that it wouldn't be possible on the GUI to display the options of the
-        file path filter which doesn't contain a report (i.e. their endpoint is
-        not in that file). In the future it would be enough to ignore the
-        filter only if "anywhere on bugpath" option is used (TODO?).
-        """
-        runid = self._runids[0]
-        run_filter = ReportFilter(
-            filepath="call*",
-            componentNames=["doesn't exist"])
-        file_counts = self._cc_client.getFileCounts(
-            [runid], run_filter, None, None, 0)
-
-        self.assertEqual(len(file_counts), len(self.run1_files))
-
     def test_run2_all_file(self):
         """
         Get all the file counts for run2.