From c097f053dae8258e4f3a79612e3c529a479f0ee6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Yifan=20Feng=28=E4=B8=B0=E4=B8=80=E5=B8=86=29?=
Date: Mon, 16 Dec 2024 17:11:29 +0800
Subject: [PATCH] Optimize search performance and add stress test

---
 .github/workflows/test.yml |  33 ++++++++
 .gitignore                 |   1 +
 README.md                  |  60 ++++++++++++--
 hyperdb/__init__.py        |   2 +-
 hyperdb/hypergraph.py      |  28 +++----
 performance/__init__.py    |   0
 performance/stress_test.py | 161 +++++++++++++++++++++++++++++++++++++
 7 files changed, 263 insertions(+), 22 deletions(-)
 create mode 100644 .github/workflows/test.yml
 create mode 100644 performance/__init__.py
 create mode 100644 performance/stress_test.py

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
new file mode 100644
index 0000000..dc5ee08
--- /dev/null
+++ b/.github/workflows/test.yml
@@ -0,0 +1,33 @@
+name: Run Tests
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+    branches:
+      - main
+
+jobs:
+  unit-tests:
+    runs-on: ubuntu-latest
+    steps:
+      # Step 1: Checkout the repository code
+      - name: Checkout code
+        uses: actions/checkout@v3
+
+      # Step 2: Setup Python environment
+      - name: Setup Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.10"
+
+      # Step 3: Install dependencies from requirements.txt
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+
+      # Step 4: Run unit tests using pytest
+      - name: Run unit tests
+        run: pytest tests
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index 3d20186..51de6b6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,6 +3,7 @@ __pycache__/
 *.py[cod]
 *$py.class
 *.DS_Store
+logs/
 
 # C extensions
 
diff --git a/README.md b/README.md
index 2fc65b6..e2fb538 100644
--- a/README.md
+++ b/README.md
@@ -48,15 +48,61 @@
-## :dart: About ##
+## :dart: About
 
 Hypergraph-DB is a lightweight, flexible, and Python-based database designed to model and manage **hypergraphs**—a generalized graph structure where edges (hyperedges) can connect any number of vertices. This makes Hypergraph-DB an ideal solution for representing complex relationships between entities in various domains, such as knowledge graphs, social networks, and scientific data modeling.
 
 Hypergraph-DB provides a high-level abstraction for working with vertices and hyperedges, making it easy to add, update, query, and manage hypergraph data. With built-in support for persistence, caching, and efficient operations, Hypergraph-DB simplifies the management of hypergraph data structures.
 
+**:bar_chart: Performance Test Results**
+
+To demonstrate the performance of **Hypergraph-DB**, let’s consider an example:
+
+- Suppose we want to construct a **hypergraph** with **1,000,000 vertices** and **200,000 hyperedges**.
+- Using Hypergraph-DB, it takes approximately:
+  - **1.75 seconds** to add **1,000,000 vertices**.
+  - **1.82 seconds** to add **200,000 hyperedges**.
+- Querying this hypergraph:
+  - Retrieving information for **400,000 vertices** takes **0.51 seconds**.
+  - Retrieving information for **400,000 hyperedges** takes **2.52 seconds**.
+
+This example demonstrates the efficiency of Hypergraph-DB, even when working with large-scale hypergraphs. Below is a detailed table showing how the performance scales as the size of the hypergraph increases.
+
+**Detailed Performance Results**
+
+The following table shows the results of stress tests performed on Hypergraph-DB with varying scales. The tests measure the time taken to add vertices, add hyperedges, and query vertices and hyperedges.
+
+| **Number of Vertices** | **Number of Hyperedges** | **Add Vertices (s)** | **Add Edges (s)** | **Query Vertices (s/queries)** | **Query Edges (s/queries)** | **Total Time (s)** |
+|------------------------|--------------------------|----------------------|-------------------|--------------------------------|-----------------------------|--------------------|
+| 5,000                  | 1,000                    | 0.01                 | 0.01              | 0.00/2,000                     | 0.01/2,000                  | 0.02               |
+| 10,000                 | 2,000                    | 0.01                 | 0.01              | 0.00/4,000                     | 0.02/4,000                  | 0.05               |
+| 25,000                 | 5,000                    | 0.03                 | 0.04              | 0.01/10,000                    | 0.05/10,000                 | 0.13               |
+| 50,000                 | 10,000                   | 0.06                 | 0.07              | 0.02/20,000                    | 0.12/20,000                 | 0.26               |
+| 100,000                | 20,000                   | 0.12                 | 0.17              | 0.04/40,000                    | 0.24/40,000                 | 0.58               |
+| 250,000                | 50,000                   | 0.35                 | 0.40              | 0.11/100,000                   | 0.61/100,000                | 1.47               |
+| 500,000                | 100,000                  | 0.85                 | 1.07              | 0.22/200,000                   | 1.20/200,000                | 3.34               |
+| 1,000,000              | 200,000                  | 1.75                 | 1.82              | 0.51/400,000                   | 2.52/400,000                | 6.60               |
+
+---
+
+**Key Observations:**
+
+1. **Scalability**:
+   Hypergraph-DB scales efficiently with the number of vertices and hyperedges. The time to add vertices and hyperedges grows linearly with the size of the hypergraph.
+
+2. **Query Performance**:
+   Querying vertices and hyperedges remains fast, even for large-scale hypergraphs. For instance:
+   - Querying **200,000 vertices** takes only **0.22 seconds**.
+   - Querying **200,000 hyperedges** takes only **1.20 seconds**.
+
+3. **Total Time**:
+   The total time to construct and query a hypergraph with **1,000,000 vertices** and **200,000 hyperedges** is only **6.60 seconds**, showcasing the overall efficiency of Hypergraph-DB.
+
+This performance makes **Hypergraph-DB** a great choice for applications requiring fast and scalable hypergraph data management.
+
 ---
 
-## :sparkles: Features ##
+## :sparkles: Features
 
 :heavy_check_mark: **Flexible Hypergraph Representation**
 - Supports vertices (`v`) and hyperedges (`e`), where hyperedges can connect any number of vertices.
@@ -78,7 +124,7 @@ Hypergraph-DB provides a high-level abstraction for working with vertices and hy
 
 ---
 
-## :rocket: Installation ##
+## :rocket: Installation
 
 Hypergraph-DB is a Python library. You can install it directly from PyPI using `pip`.
 
@@ -100,7 +146,7 @@ pip install -r requirements.txt
 
 ---
 
-## :checkered_flag: Starting ##
+## :checkered_flag: Starting
 
 This section provides a quick guide to get started with Hypergraph-DB, including iusage, and running basic operations. Below is an example of how to use Hypergraph-DB, based on the provided test cases.
 
@@ -174,7 +220,7 @@ print(hg.nbr_v(1)) # Output: {3, 4}
 print(hg.nbr_e_of_v(1)) # Output: {(1, 3, 4)}
 ```
 
-#### **6. Persistence (Save and Load)**
+#### **6. Persistence (Save and Load)**
 
 ```python
 # Save the hypergraph to a file
@@ -190,14 +236,14 @@ print(hg2.all_e) # Output: {(1, 3, 4)}
 ```
 
 ---
 
-## :memo: License ##
+## :memo: License
 
 Hypergraph-DB is open-source and licensed under the [Apache License 2.0](LICENSE). Feel free to use, modify, and distribute it as per the license terms.
 
 ---
 
-## :email: Contact ##
+## :email: Contact
 
 Hypergraph-DB is maintained by [iMoon-Lab](http://moon-lab.tech/), Tsinghua University. If you have any questions, please feel free to contact us via email: [Yifan Feng](mailto:evanfeng97@gmail.com).
 
diff --git a/hyperdb/__init__.py b/hyperdb/__init__.py
index 74fadac..58a0752 100644
--- a/hyperdb/__init__.py
+++ b/hyperdb/__init__.py
@@ -3,6 +3,6 @@
 from ._global import AUTHOR_EMAIL
 
 
-__version__ = "0.1.0"
+__version__ = "0.1.1"
 
 __all__ = {"AUTHOR_EMAIL", "BaseHypergraphDB", "HypergraphDB"}
diff --git a/hyperdb/hypergraph.py b/hyperdb/hypergraph.py
index 3feda53..a12efab 100644
--- a/hyperdb/hypergraph.py
+++ b/hyperdb/hypergraph.py
@@ -112,7 +112,7 @@ def encode_e(self, e_tuple: Union[List, Set, Tuple]) -> Tuple:
         for v_id in tmp:
             assert isinstance(v_id, Hashable), "The vertex id must be hashable."
             assert (
-                v_id in self.all_v
+                v_id in self._v_data
             ), f"The vertex {v_id} does not exist in the hypergraph."
         return tuple(tmp)
 
@@ -157,7 +157,7 @@ def add_v(self, v_id: Any, v_data: Optional[Dict] = None):
             assert isinstance(v_data, dict), "The vertex data must be a dictionary."
         else:
             v_data = {}
-        if v_id not in self.all_v:
+        if v_id not in self._v_data:
             self._v_data[v_id] = v_data
             self._v_inci[v_id] = set()
         else:
@@ -180,7 +180,7 @@ def add_e(self, e_tuple: Union[List, Set, Tuple], e_data: Optional[Dict] = None)
         else:
             e_data = {}
         e_tuple = self.encode_e(e_tuple)
-        if e_tuple not in self.all_e:
+        if e_tuple not in self._e_data:
             self._e_data[e_tuple] = e_data
             for v in e_tuple:
                 self._v_inci[v].add(e_tuple)
@@ -197,7 +197,7 @@ def remove_v(self, v_id: Any):
         """
         assert isinstance(v_id, Hashable), "The vertex id must be hashable."
         assert (
-            v_id in self.all_v
+            v_id in self._v_data
         ), f"The vertex {v_id} does not exist in the hypergraph."
         del self._v_data[v_id]
         for e_tuple in self._v_inci[v_id]:
@@ -220,7 +220,7 @@ def remove_e(self, e_tuple: Union[List, Set, Tuple]):
         ), "The hyperedge must be a list, set, or tuple of vertex ids."
         e_tuple = self.encode_e(e_tuple)
         assert (
-            e_tuple in self.all_e
+            e_tuple in self._e_data
         ), f"The hyperedge {e_tuple} does not exist in the hypergraph."
         for v in e_tuple:
             self._v_inci[v].remove(e_tuple)
@@ -238,7 +238,7 @@ def update_v(self, v_id: Any, v_data: dict):
         assert isinstance(v_id, Hashable), "The vertex id must be hashable."
         assert isinstance(v_data, dict), "The vertex data must be a dictionary."
         assert (
-            v_id in self.all_v
+            v_id in self._v_data
         ), f"The vertex {v_id} does not exist in the hypergraph."
         self._v_data[v_id].update(v_data)
         self._clear_cache()
@@ -257,7 +257,7 @@ def update_e(self, e_tuple: Union[List, Set, Tuple], e_data: dict):
         assert isinstance(e_data, dict), "The hyperedge data must be a dictionary."
         e_tuple = self.encode_e(e_tuple)
         assert (
-            e_tuple in self.all_e
+            e_tuple in self._e_data
         ), f"The hyperedge {e_tuple} does not exist in the hypergraph."
         self._e_data[e_tuple].update(e_data)
         self._clear_cache()
@@ -270,7 +270,7 @@ def has_v(self, v_id: Any) -> bool:
             ``v_id`` (``Any``): The vertex id.
         """
         assert isinstance(v_id, Hashable), "The vertex id must be hashable."
-        return v_id in self.all_v
+        return v_id in self._v_data
 
     def has_e(self, e_tuple: Union[List, Set, Tuple]) -> bool:
         r"""
@@ -286,7 +286,7 @@ def has_e(self, e_tuple: Union[List, Set, Tuple]) -> bool:
             e_tuple = self.encode_e(e_tuple)
         except AssertionError:
             return False
-        return e_tuple in self.all_e
+        return e_tuple in self._e_data
 
     def degree_v(self, v_id: Any) -> int:
         r"""
@@ -297,7 +297,7 @@ def degree_v(self, v_id: Any) -> int:
         """
         assert isinstance(v_id, Hashable), "The vertex id must be hashable."
         assert (
-            v_id in self.all_v
+            v_id in self._v_data
         ), f"The vertex {v_id} does not exist in the hypergraph."
         return len(self._v_inci[v_id])
 
@@ -313,7 +313,7 @@ def degree_e(self, e_tuple: Union[List, Set, Tuple]) -> int:
         ), "The hyperedge must be a list, set, or tuple of vertex ids."
         e_tuple = self.encode_e(e_tuple)
         assert (
-            e_tuple in self.all_e
+            e_tuple in self._e_data
         ), f"The hyperedge {e_tuple} does not exist in the hypergraph."
         return len(e_tuple)
 
@@ -326,7 +326,7 @@ def nbr_e_of_v(self, v_id: Any) -> list:
         """
         assert isinstance(v_id, Hashable), "The vertex id must be hashable."
         assert (
-            v_id in self.all_v
+            v_id in self._v_data
         ), f"The vertex {v_id} does not exist in the hypergraph."
         return set(self._v_inci[v_id])
 
@@ -342,7 +342,7 @@ def nbr_v_of_e(self, e_tuple: Union[List, Set, Tuple]) -> list:
         ), "The hyperedge must be a list, set, or tuple of vertex ids."
         e_tuple = self.encode_e(e_tuple)
         assert (
-            e_tuple in self.all_e
+            e_tuple in self._e_data
         ), f"The hyperedge {e_tuple} does not exist in the hypergraph."
         return set(e_tuple)
 
@@ -355,7 +355,7 @@ def nbr_v(self, v_id: Any, exclude_self=True) -> list:
         """
         assert isinstance(v_id, Hashable), "The vertex id must be hashable."
         assert (
-            v_id in self.all_v
+            v_id in self._v_data
         ), f"The vertex {v_id} does not exist in the hypergraph."
         nbrs = set()
         for e_tuple in self._v_inci[v_id]:
diff --git a/performance/__init__.py b/performance/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/performance/stress_test.py b/performance/stress_test.py
new file mode 100644
index 0000000..51a2e0c
--- /dev/null
+++ b/performance/stress_test.py
@@ -0,0 +1,161 @@
+import sys
+from pathlib import Path
+
+sys.path.append(str(Path(__file__).parent.parent))
+
+import time
+import random
+import logging
+
+from hyperdb.hypergraph import HypergraphDB
+
+
+# Configure log directory and file
+log_root = Path(__file__).parent / "logs"
+if not log_root.exists():
+    log_root.mkdir()
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s - %(levelname)s - %(message)s",
+    filename=log_root / "stress_test.log",
+    filemode="w",
+)
+logger = logging.getLogger(__name__)
+
+
+def add_vertices(hg, num_vertices):
+    """Add multiple vertices to the hypergraph."""
+    start_time = time.time()
+    for i in range(1, num_vertices + 1):
+        hg.add_v(i, {"name": f"Vertex-{i}"})
+    end_time = time.time()
+    total_time = end_time - start_time
+    logger.info(f"Added {num_vertices} vertices in {total_time:.2f} seconds.")
+    return total_time
+
+
+def add_edges(hg, num_edges, max_vertices):
+    """Add multiple hyperedges to the hypergraph."""
+    start_time = time.time()
+    for _ in range(num_edges):
+        edge_size = random.randint(2, min(5, max_vertices))  # Each hyperedge contains 2 to 5 vertices
+        vertices = random.sample(range(1, max_vertices + 1), edge_size)
+        hg.add_e(tuple(vertices), {"relation": "random_edge"})
+    end_time = time.time()
+    total_time = end_time - start_time
+    logger.info(f"Added {num_edges} edges in {total_time:.2f} seconds.")
+    return total_time
+
+
+def query_vertices(hg, num_queries, max_vertices):
+    """Randomly query vertex data."""
+    start_time = time.time()
+    for _ in range(num_queries):
+        v_id = random.randint(1, max_vertices)
+        hg.v(v_id)
+    end_time = time.time()
+    total_time = end_time - start_time
+    logger.info(f"Queried {num_queries} vertices in {total_time:.2f} seconds.")
+    return total_time
+
+
+def query_edges(hg, num_queries, max_vertices):
+    """Randomly query hyperedge data."""
+    start_time = time.time()
+    for _ in range(num_queries):
+        edge_size = random.randint(2, min(5, max_vertices))
+        vertices = random.sample(range(1, max_vertices + 1), edge_size)
+        hg.e(tuple(vertices), default=None)
+    end_time = time.time()
+    total_time = end_time - start_time
+    logger.info(f"Queried {num_queries} edges in {total_time:.2f} seconds.")
+    return total_time
+
+
+def stress_test(num_vertices=5000, num_edges=1000, num_queries=2000, scale_factor=1):
+    """
+    Single-threaded stress test for HypergraphDB.
+
+    - Add vertices
+    - Add hyperedges
+    - Query vertices and hyperedges
+    """
+    hg = HypergraphDB()
+
+    # Adjust the number of vertices and edges based on the scale factor
+    vertices = num_vertices * scale_factor
+    edges = num_edges * scale_factor
+    queries = num_queries * scale_factor
+
+    logger.info(f"Starting stress test with scale factor {scale_factor}...")
+
+    # Step 1: Add vertices
+    vertex_add_time = add_vertices(hg, vertices)
+
+    # Step 2: Add hyperedges
+    edge_add_time = add_edges(hg, edges, vertices)
+
+    # Step 3: Query vertices
+    vertex_query_time = query_vertices(hg, queries, vertices)
+
+    # Step 4: Query hyperedges
+    edge_query_time = query_edges(hg, queries, vertices)
+
+    total_time = vertex_add_time + edge_add_time + vertex_query_time + edge_query_time
+
+    # Collect test results
+    result = {
+        "vertices_added": vertices,
+        "edges_added": edges,
+        "vertex_queries": queries,
+        "edge_queries": queries,
+        "vertex_add_time": vertex_add_time,
+        "edge_add_time": edge_add_time,
+        "vertex_query_time": vertex_query_time,
+        "edge_query_time": edge_query_time,
+        "total_time": total_time,
+    }
+
+    # Log test results
+    logger.info(f"Stress Test Completed for Scale Factor {scale_factor}:")
+    logger.info(f"  - Vertices added: {result['vertices_added']} in {result['vertex_add_time']:.2f} seconds")
+    logger.info(f"  - Edges added: {result['edges_added']} in {result['edge_add_time']:.2f} seconds")
+    logger.info(f"  - Vertex queries: {result['vertex_queries']} in {result['vertex_query_time']:.2f} seconds")
+    logger.info(f"  - Edge queries: {result['edge_queries']} in {result['edge_query_time']:.2f} seconds")
+    logger.info(f"  - Total Time: {result['total_time']:.2f} seconds")
+
+    # Return results for summary
+    return result
+
+
+def stress_increasing_scales_test():
+    """
+    Perform stress tests on hypergraphs with increasing scales.
+    """
+    scale_factors = [1, 2, 5, 10, 20, 50, 100, 200]  # Increasing scale factors
+    results = []
+
+    # Collect results for each scale factor
+    for scale in scale_factors:
+        logger.info(f"Starting stress test for scale factor: {scale}")
+        result = stress_test(num_vertices=5000, num_edges=1000, num_queries=2000, scale_factor=scale)
+        results.append(result)
+
+    # Output test results as a table
+    logger.info("\nSummary of Stress Test Results:\n")
+    logger.info(f"{'num v':<10}{'num e':<10}{'add v':<10}{'add e':<10}{'query v':<15}{'query e':<15}{'total time':<10}")
+    logger.info("-" * 80)
+    for result in results:
+        logger.info(
+            f"{result['vertices_added']:<10}"
+            f"{result['edges_added']:<10}"
+            f"{result['vertex_add_time']:<10.2f}"
+            f"{result['edge_add_time']:<10.2f}"
+            f"{result['vertex_query_time']:.2f}/{result['vertex_queries']:<10}"
+            f"{result['edge_query_time']:.2f}/{result['edge_queries']:<10}"
+            f"{result['total_time']:<10.2f}"
+        )
+
+
+if __name__ == "__main__":
+    stress_increasing_scales_test()
\ No newline at end of file
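
---

**Note on the optimization (illustrative, not part of the patch):** the functional change in `hyperdb/hypergraph.py` is to run existence checks against the underlying `_v_data` / `_e_data` dicts instead of the public `all_v` / `all_e` views. Assuming `all_v` and `all_e` are (cached) properties that materialize a set of ids and are invalidated by the `_clear_cache()` calls after every mutation, each `add_v` / `add_e` would otherwise pay an O(n) rebuild just to test membership, which makes bulk loads roughly quadratic, whereas a plain dict key lookup is O(1). The sketch below mirrors that assumed structure with a hypothetical `ToyStore` class so the difference is visible; it is not the HypergraphDB implementation.

```python
import time


class ToyStore:
    """Hypothetical stand-in that mimics the assumed all_v / _v_data layout."""

    def __init__(self, n):
        self._v_data = {i: {} for i in range(n)}

    @property
    def all_v(self):
        # Assumed behavior: materializes a fresh set of ids on each access (O(n)).
        return set(self._v_data.keys())


def bench(n=100_000, checks=1_000):
    store = ToyStore(n)

    start = time.time()
    for i in range(checks):
        _ = i in store.all_v  # rebuilds an n-element set per membership check
    via_property = time.time() - start

    start = time.time()
    for i in range(checks):
        _ = i in store._v_data  # O(1) dict key lookup
    via_dict = time.time() - start

    print(f"via all_v property: {via_property:.2f}s, via _v_data dict: {via_dict:.4f}s")


if __name__ == "__main__":
    bench()
```

The stress test added under `performance/` can be run directly with `python performance/stress_test.py`; it writes per-step timings and a summary table in the same format as the README results to `performance/logs/stress_test.log`.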