From f05de3e2b8bceb9c6c8ea35ef79995b39f951858 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Barnab=C3=A1s=20Domozi?= Date: Tue, 12 Aug 2025 12:51:45 +0000 Subject: [PATCH 1/3] Introduce cache database and reindex command --- analyzer/codechecker_analyzer/cachedb.py | 99 ++++++++++++++++ analyzer/codechecker_analyzer/cli/reindex.py | 108 ++++++++++++++++++ docs/analyzer/user_guide.md | 31 +++++ .../report/parser/plist.py | 16 ++- 4 files changed, 251 insertions(+), 3 deletions(-) create mode 100644 analyzer/codechecker_analyzer/cachedb.py create mode 100644 analyzer/codechecker_analyzer/cli/reindex.py diff --git a/analyzer/codechecker_analyzer/cachedb.py b/analyzer/codechecker_analyzer/cachedb.py new file mode 100644 index 0000000000..07ecfb4a3e --- /dev/null +++ b/analyzer/codechecker_analyzer/cachedb.py @@ -0,0 +1,99 @@ +# ------------------------------------------------------------------------- +# +# Part of the CodeChecker project, under the Apache License v2.0 with +# LLVM Exceptions. See LICENSE for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# ------------------------------------------------------------------------- + +import sqlite3 +import itertools +import os +from typing import List + + +class CacheDB: + """ + SQLite database located in the report directory, + designed to speed up the parsing process. + """ + + __sqlitedb_path: str + __con: sqlite3.Connection + __cur: sqlite3.Cursor + + def __init__(self, report_dir: str, clean: bool = False): + """ + Initiates the cache database and creates the necessary tables. + + Args: + report_dir (str): path to the report directory + clean (bool): If set to True, the previous database + will be dropped and a new one is created. + """ + self.__sqlitedb_path = os.path.join(report_dir, "cache.sqlite") + + if clean and os.path.exists(self.__sqlitedb_path): + os.remove(self.__sqlitedb_path) + + self.__create_connection() + + def __create_connection(self): + self.__con = sqlite3.connect(self.__sqlitedb_path) + self.__cur = self.__con.cursor() + self.__create_tables() + + def close_connection(self): + """ + Closes the connection to the cache database and writes + changes to the disk. + """ + self.__con.close() + + def __table_exists(self, name: str) -> bool: + res = self.__cur.execute("SELECT name FROM sqlite_master WHERE name=?", + [name]) + return res.fetchone() is not None + + def __create_tables(self): + if not self.__table_exists("plist_lookup"): + self.__cur.execute("CREATE TABLE plist_lookup" + "(plist TEXT, source TEXT)") + + def insert_plist_sources(self, plist_file: str, source_files: List[str]): + """ + Inserts the plist file and its associated source files into the + cache database. These source files are located in the 'files' section + of an individual plist file. + + Args: + plist_file (str): path to the plist file + source_files (List[str]): list of source files mapped to + the plist file + """ + + data = list(zip(itertools.repeat(plist_file), source_files)) + self.__cur.executemany("INSERT INTO plist_lookup VALUES(?, ?)", data) + self.__con.commit() + + def plist_query(self, source_files: List[str]) -> List[str]: + """ + Returns all plist files associated with any of the given source files + by querying the cache database. + + Args: + source_files (List[str]): list of source files to be looked up + from the cache database. + """ + + placeholders = ','.join('?' for _ in source_files) + res = self.__cur.execute("SELECT plist FROM plist_lookup WHERE source" + f" IN ({placeholders})", source_files) + return list(map(lambda e: e[0], res)) + + def get_indexed_plist_files(self) -> List[str]: + """ + Returns already indexed plist files from the cache database. + """ + res = self.__cur.execute("SELECT DISTINCT plist FROM plist_lookup") + return list(map(lambda e: e[0], res)) diff --git a/analyzer/codechecker_analyzer/cli/reindex.py b/analyzer/codechecker_analyzer/cli/reindex.py new file mode 100644 index 0000000000..77a6419656 --- /dev/null +++ b/analyzer/codechecker_analyzer/cli/reindex.py @@ -0,0 +1,108 @@ +# ------------------------------------------------------------------------- +# +# Part of the CodeChecker project, under the Apache License v2.0 with +# LLVM Exceptions. See LICENSE for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# ------------------------------------------------------------------------- + +import os +import sys +from codechecker_analyzer.cachedb import CacheDB +from codechecker_common import arg, logger +from codechecker_common.compatibility.multiprocessing import Pool, cpu_count +from codechecker_report_converter.report.parser import plist as plistparser +from typing import List, Tuple + +LOG = logger.get_logger('system') + + +def get_argparser_ctor_args(): + """ + This method returns a dict containing the kwargs for constructing an + argparse.ArgumentParser (either directly or as a subparser). + """ + + return { + 'prog': 'CodeChecker reindex', + 'formatter_class': arg.RawDescriptionDefaultHelpFormatter, + 'description': """ +The analysis cache database is a SQLite database located in the +report directory, designed to speed up the parsing process. +In case it is missing or outdated, one can use the 'reindex' command to +recreate/update this database.""", + 'help': "Recreate/update the cache database given a report directory." + } + + +def add_arguments_to_parser(parser): + """ + Add the subcommand's arguments to the given argparse.ArgumentParser. + """ + + parser.add_argument('input', + type=str, + nargs='+', + metavar='folder', + help="The analysis result folder(s) containing " + "analysis results which should be " + "reindexed.") + + parser.add_argument('-j', '--jobs', + type=int, + dest="jobs", + required=False, + default=cpu_count(), + help="Number of threads to use for reindex. More " + "threads mean faster reindex at the cost of " + "using more memory.") + + parser.add_argument('-f', '--force', + action="store_true", + dest="force", + required=False, + default=False, + help="Drop the previous cache database and do a " + "clean reindex.") + + logger.add_verbose_arguments(parser) + parser.set_defaults(func=main) + + +def main(args): + logger.setup_logger(args.verbose if 'verbose' in args else None) + for i in args.input: + update_cache_db(i, args.force, args.jobs) + + +def __process_file(file_path: str) -> Tuple[str, List[str]]: + with open(file_path, 'rb') as fp: + plist = plistparser.parse(fp) + + file_list = [] if plist is None else \ + plistparser.get_file_list(plist, os.path.dirname(file_path)) + return (file_path, file_list) + + +def update_cache_db(report_dir: str, force: bool, jobs: int): + if not os.path.isdir(report_dir): + LOG.error("Directory %s does not exist!", report_dir) + sys.exit(1) + + report_dir = os.path.abspath(report_dir) + cachedb = CacheDB(report_dir, force) + indexed_files = cachedb.get_indexed_plist_files() + + plist_files = filter(lambda f: f.endswith( + plistparser.EXTENSION), os.listdir(report_dir)) + plist_files = map(lambda f: os.path.abspath( + os.path.join(report_dir, f)), plist_files) + plist_files = list(filter(lambda f: f not in indexed_files, plist_files)) + + with Pool(jobs) as p: + res = p.map(__process_file, plist_files) + for (plist_file, sources) in res: + if sources != []: + cachedb.insert_plist_sources(plist_file, sources) + + cachedb.close_connection() diff --git a/docs/analyzer/user_guide.md b/docs/analyzer/user_guide.md index 65340eb516..ad50dd3b1a 100644 --- a/docs/analyzer/user_guide.md +++ b/docs/analyzer/user_guide.md @@ -46,6 +46,7 @@ - [`fixit`](#fixit) - [`checkers`](#checkers) - [`analyzers`](#analyzers) + - [`reindex`](#reindex) - [Configuring Clang version](#configuring-clang-version) - [Review status handling](#review-status-handling) - [Setting with source code comments](#setting-with-source-code-comments) @@ -2539,6 +2540,36 @@ A detailed view of the available analyzers is available via `--details`. In the A machine-readable `csv` or `json` output can be generated by supplying the `--output csv` or `--output json` argument. +### `reindex` + +`reindex` is used to recreate/update the analysis cache database. +This cache database is used to speed up the parsing process. + +
+ + $ CodeChecker reindex --help (click to expand) + + +``` +usage: CodeChecker reindex [-h] [-j JOBS] [-f] [--verbose {info,debug_analyzer,debug}] folder [folder ...] + +The analysis cache database is a SQLite database located in the +report directory, designed to speed up the parsing process. +In case it is missing or outdated, one can use the 'reindex' command to +recreate/update this database. + +positional arguments: + folder The analysis result folder(s) containing analysis results which should be reindexed. + +options: + -h, --help show this help message and exit + -j JOBS, --jobs JOBS Number of threads to use for reindex. More threads mean faster reindex at the cost of using more memory. (default: 16) + -f, --force Drop the previous cache database and do a clean reindex. (default: False) + --verbose {info,debug_analyzer,debug} + Set verbosity level. +``` +
+ ## Configuring Clang version _Clang_ and/or _Clang-Tidy_ must be available on your system before you can diff --git a/tools/report-converter/codechecker_report_converter/report/parser/plist.py b/tools/report-converter/codechecker_report_converter/report/parser/plist.py index 3f79878f06..b0c3dd06e0 100644 --- a/tools/report-converter/codechecker_report_converter/report/parser/plist.py +++ b/tools/report-converter/codechecker_report_converter/report/parser/plist.py @@ -178,15 +178,25 @@ def get_file_index_map( ) -> Dict[int, File]: """ Get file index map from the given plist object. """ file_index_map: Dict[int, File] = {} + file_list = get_file_list(plist, source_dir_path) - for i, orig_file_path in enumerate(plist.get('files', [])): - file_path = os.path.normpath(os.path.join( - source_dir_path, orig_file_path)) + for i, file_path in enumerate(file_list): file_index_map[i] = get_or_create_file(file_path, file_cache) return file_index_map +def get_file_list( + plist: Any, + source_dir_path: str +) -> List[str]: + """ Get file list section from the given plist object. """ + return list(map( + lambda f: os.path.normpath(os.path.join( + source_dir_path, f)), + plist.get('files', []))) + + class Parser(BaseParser): def get_reports( self, From d71e9af223db95e17989ef997ccd3fed2f208675 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Barnab=C3=A1s=20Domozi?= Date: Thu, 4 Sep 2025 19:24:22 +0200 Subject: [PATCH 2/3] Added unit test for CacheDB. --- analyzer/codechecker_analyzer/cachedb.py | 2 +- analyzer/tests/unit/test_cachedb.py | 88 ++++++++++++++++++++++++ 2 files changed, 89 insertions(+), 1 deletion(-) create mode 100644 analyzer/tests/unit/test_cachedb.py diff --git a/analyzer/codechecker_analyzer/cachedb.py b/analyzer/codechecker_analyzer/cachedb.py index 07ecfb4a3e..dad93981d5 100644 --- a/analyzer/codechecker_analyzer/cachedb.py +++ b/analyzer/codechecker_analyzer/cachedb.py @@ -89,7 +89,7 @@ def plist_query(self, source_files: List[str]) -> List[str]: placeholders = ','.join('?' for _ in source_files) res = self.__cur.execute("SELECT plist FROM plist_lookup WHERE source" f" IN ({placeholders})", source_files) - return list(map(lambda e: e[0], res)) + return list(set(map(lambda e: e[0], res))) def get_indexed_plist_files(self) -> List[str]: """ diff --git a/analyzer/tests/unit/test_cachedb.py b/analyzer/tests/unit/test_cachedb.py new file mode 100644 index 0000000000..a05c605a92 --- /dev/null +++ b/analyzer/tests/unit/test_cachedb.py @@ -0,0 +1,88 @@ +# ------------------------------------------------------------------------- +# +# Part of the CodeChecker project, under the Apache License v2.0 with +# LLVM Exceptions. See LICENSE for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# ------------------------------------------------------------------------- + +"""Tests analyzer CacheDB operations""" + +import unittest +import shutil +import os +from libtest import env +from codechecker_analyzer.cachedb import CacheDB + + +class CacheDBTest(unittest.TestCase): + """Tests analyzer CacheDB operations""" + + def setup_class(self): + """Setup the environment for the tests.""" + + global TEST_WORKSPACE + + TEST_WORKSPACE = env.get_workspace('cachedb') + self.test_workspace = TEST_WORKSPACE + + def teardown_class(self): + """Delete the workspace associated with this test""" + + global TEST_WORKSPACE + + print("Removing: " + TEST_WORKSPACE) + shutil.rmtree(TEST_WORKSPACE) + + def insert_dummy_data(self, cachedb: CacheDB): + cachedb.insert_plist_sources( + "foo.plist", ["a.h", "a.c", "b.c"]) + cachedb.insert_plist_sources( + "bar.plist", ["c.c", "a.c", "b.h"]) + + def test_cachedb_creation(self): + """Tests if the SQLite database was created""" + + CacheDB(self.test_workspace, True) + db_path = os.path.join(self.test_workspace, "cache.sqlite") + self.assertTrue(os.path.isfile(db_path)) + + def test_cachedb_insert(self): + + cachedb = CacheDB(self.test_workspace, True) + self.insert_dummy_data(cachedb) + + self.assertCountEqual(cachedb.get_indexed_plist_files(), + ["foo.plist", "bar.plist"]) + + def test_cachedb_insert_with_closing(self): + + cachedb = CacheDB(self.test_workspace, True) + self.insert_dummy_data(cachedb) + cachedb.close_connection() + + cachedb = CacheDB(self.test_workspace) + self.assertCountEqual(cachedb.get_indexed_plist_files(), + ["foo.plist", "bar.plist"]) + + def test_cachedb_querying(self): + cachedb = CacheDB(self.test_workspace, True) + self.insert_dummy_data(cachedb) + + self.assertCountEqual(cachedb.plist_query(["a.h"]), + ["foo.plist"]) + self.assertCountEqual(cachedb.plist_query(["c.c"]), + ["bar.plist"]) + self.assertCountEqual(cachedb.plist_query(["a.c"]), + ["foo.plist", "bar.plist"]) + + def test_cachedb_querying2(self): + cachedb = CacheDB(self.test_workspace, True) + self.insert_dummy_data(cachedb) + + self.assertCountEqual(cachedb.plist_query(["a.h", "c.c"]), + ["foo.plist", "bar.plist"]) + self.assertCountEqual(cachedb.plist_query(["a.h", "c.c", "b.h"]), + ["foo.plist", "bar.plist"]) + self.assertCountEqual(cachedb.plist_query(["c.c", "b.h"]), + ["bar.plist"]) From 826ee6a45e5c228b5ac422f55affdae18e3a328c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Barnab=C3=A1s=20Domozi?= Date: Wed, 10 Sep 2025 18:15:44 +0200 Subject: [PATCH 3/3] Added functional tests for reindex command --- analyzer/tests/functional/reindex/__init__.py | 11 ++ .../tests/functional/reindex/test_files/a.c | 8 ++ .../tests/functional/reindex/test_files/a.h | 4 + .../tests/functional/reindex/test_reindex.py | 127 ++++++++++++++++++ 4 files changed, 150 insertions(+) create mode 100644 analyzer/tests/functional/reindex/__init__.py create mode 100644 analyzer/tests/functional/reindex/test_files/a.c create mode 100644 analyzer/tests/functional/reindex/test_files/a.h create mode 100644 analyzer/tests/functional/reindex/test_reindex.py diff --git a/analyzer/tests/functional/reindex/__init__.py b/analyzer/tests/functional/reindex/__init__.py new file mode 100644 index 0000000000..0b0114f7e4 --- /dev/null +++ b/analyzer/tests/functional/reindex/__init__.py @@ -0,0 +1,11 @@ +# coding=utf-8 +# ------------------------------------------------------------------------- +# +# Part of the CodeChecker project, under the Apache License v2.0 with +# LLVM Exceptions. See LICENSE for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# ------------------------------------------------------------------------- + +# This file is empty, and is only present so that this directory will form a +# package. diff --git a/analyzer/tests/functional/reindex/test_files/a.c b/analyzer/tests/functional/reindex/test_files/a.c new file mode 100644 index 0000000000..6f75d2e499 --- /dev/null +++ b/analyzer/tests/functional/reindex/test_files/a.c @@ -0,0 +1,8 @@ +#include +#include "a.h" + +int main() +{ + int a = foo(); + return 1 / 0; +} diff --git a/analyzer/tests/functional/reindex/test_files/a.h b/analyzer/tests/functional/reindex/test_files/a.h new file mode 100644 index 0000000000..4172a073da --- /dev/null +++ b/analyzer/tests/functional/reindex/test_files/a.h @@ -0,0 +1,4 @@ +int foo() +{ + return 1 / 0; +} diff --git a/analyzer/tests/functional/reindex/test_reindex.py b/analyzer/tests/functional/reindex/test_reindex.py new file mode 100644 index 0000000000..761279ad9a --- /dev/null +++ b/analyzer/tests/functional/reindex/test_reindex.py @@ -0,0 +1,127 @@ +# +# ------------------------------------------------------------------------- +# +# Part of the CodeChecker project, under the Apache License v2.0 with +# LLVM Exceptions. See LICENSE for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# ------------------------------------------------------------------------- + +""" +Test reindex functionality. +""" + +import os +import shutil +import subprocess +import unittest + +from libtest import env +from codechecker_analyzer.cachedb import CacheDB + + +class TestReindex(unittest.TestCase): + _ccClient = None + + def setup_class(self): + """Setup the environment for the tests.""" + + global TEST_WORKSPACE + TEST_WORKSPACE = env.get_workspace('reindex') + + report_dir = os.path.join(TEST_WORKSPACE, 'reports') + os.makedirs(report_dir) + + os.environ['TEST_WORKSPACE'] = TEST_WORKSPACE + + def teardown_class(self): + """Delete the workspace associated with this test""" + + # TODO: If environment variable is set keep the workspace + # and print out the path. + global TEST_WORKSPACE + + print("Removing: " + TEST_WORKSPACE) + shutil.rmtree(TEST_WORKSPACE) + + def setup_method(self, _): + + # TEST_WORKSPACE is automatically set by test package __init__.py . + self.test_workspace = os.environ['TEST_WORKSPACE'] + + test_class = self.__class__.__name__ + print('Running ' + test_class + ' tests in ' + self.test_workspace) + + # Get the CodeChecker cmd if needed for the tests. + self._codechecker_cmd = env.codechecker_cmd() + self._tu_collector_cmd = env.tu_collector_cmd() + self.report_dir = os.path.join(self.test_workspace, "reports") + self.test_dir = os.path.join(os.path.dirname(__file__), 'test_files') + + def __run_cmd(self, cmd, cwd): + process = subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + cwd=cwd, + encoding="utf-8", + errors="ignore") + out, err = process.communicate() + print(out) + print(err) + self.assertEqual(process.returncode, 0) + + def test_reindex(self): + build_json = os.path.join(self.test_workspace, "build.json") + + # Create and run log command + log_cmd = [self._codechecker_cmd, "log", "-b", "gcc a.c", + "-o", build_json] + self.__run_cmd(log_cmd, self.test_dir) + + # Create and run analyze command + analyze_cmd = [ + self._codechecker_cmd, "analyze", "-c", build_json, + "--analyzers", "clangsa", "-o", self.report_dir] + self.__run_cmd(analyze_cmd, self.test_dir) + + plist_files_in_report_dir = [ + os.path.join(self.report_dir, f) + for f in os.listdir(self.report_dir) + if os.path.splitext(f)[1] == ".plist"] + + # Check if there are plist files in report_dir + self.assertGreaterEqual(len(plist_files_in_report_dir), 1) + + a_c_clangsa_plist = None + for f in plist_files_in_report_dir: + if "a.c_clangsa" in f.split("/")[-1]: + a_c_clangsa_plist = f + break + + # Check if a.c_clangsa plist was found + self.assertIsNotNone(a_c_clangsa_plist) + + # Create and run reindex command + reindex_cmd = [ + self._codechecker_cmd, "reindex", "-f", self.report_dir] + self.__run_cmd(reindex_cmd, self.test_dir) + + # Check if CacheDB was created + self.assertTrue(os.path.isfile( + os.path.join(self.report_dir, "cache.sqlite"))) + + # Load CacheDB + cachedb = CacheDB(self.report_dir) + + # Check if a.c_clangsa plist was indexed by the reindex command + self.assertIn(a_c_clangsa_plist, cachedb.get_indexed_plist_files()) + + source_files_in_test_dir = [ + os.path.join(self.test_dir, f) + for f in os.listdir(self.test_dir) + if os.path.splitext(f)[1] in [".c", ".h"]] + + # Check if source files were mapped to a.c_clangsa plist + for f in source_files_in_test_dir: + self.assertIn(a_c_clangsa_plist, cachedb.plist_query([f]))