From c097f053dae8258e4f3a79612e3c529a479f0ee6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Yifan=20Feng=28=E4=B8=B0=E4=B8=80=E5=B8=86=29?=
Date: Mon, 16 Dec 2024 17:11:29 +0800
Subject: [PATCH] Optimize search performance and add stress test

---
 .github/workflows/test.yml |  33 ++++++++
 .gitignore                 |   1 +
 README.md                  |  60 ++++++++++++--
 hyperdb/__init__.py        |   2 +-
 hyperdb/hypergraph.py      |  28 +++----
 performance/__init__.py    |   0
 performance/stress_test.py | 161 +++++++++++++++++++++++++++++++++++++
 7 files changed, 263 insertions(+), 22 deletions(-)
 create mode 100644 .github/workflows/test.yml
 create mode 100644 performance/__init__.py
 create mode 100644 performance/stress_test.py

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
new file mode 100644
index 0000000..dc5ee08
--- /dev/null
+++ b/.github/workflows/test.yml
@@ -0,0 +1,33 @@
+name: Run Tests
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+    branches:
+      - main
+
+jobs:
+  unit-tests:
+    runs-on: ubuntu-latest
+    steps:
+      # Step 1: Checkout the repository code
+      - name: Checkout code
+        uses: actions/checkout@v3
+
+      # Step 2: Setup Python environment
+      - name: Setup Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.10"
+
+      # Step 3: Install dependencies from requirements.txt
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+
+      # Step 4: Run unit tests using pytest
+      - name: Run unit tests
+        run: pytest tests
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index 3d20186..51de6b6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,6 +3,7 @@ __pycache__/
 *.py[cod]
 *$py.class
 *.DS_Store
+logs/
 
 # C extensions
 
diff --git a/README.md b/README.md
index 2fc65b6..e2fb538 100644
--- a/README.md
+++ b/README.md
@@ -48,15 +48,61 @@
-## :dart: About ##
+## :dart: About
 
 Hypergraph-DB is a lightweight, flexible, and Python-based database designed to model and manage **hypergraphs**—a generalized graph structure where edges (hyperedges) can connect any number of vertices. This makes Hypergraph-DB an ideal solution for representing complex relationships between entities in various domains, such as knowledge graphs, social networks, and scientific data modeling.
 
 Hypergraph-DB provides a high-level abstraction for working with vertices and hyperedges, making it easy to add, update, query, and manage hypergraph data. With built-in support for persistence, caching, and efficient operations, Hypergraph-DB simplifies the management of hypergraph data structures.
 
+**:bar_chart: Performance Test Results**
+
+To demonstrate the performance of **Hypergraph-DB**, let’s consider an example:
+
+- Suppose we want to construct a **hypergraph** with **1,000,000 vertices** and **200,000 hyperedges**.
+- Using Hypergraph-DB, it takes approximately:
+  - **1.75 seconds** to add **1,000,000 vertices**.
+  - **1.82 seconds** to add **200,000 hyperedges**.
+- Querying this hypergraph:
+  - Retrieving information for **400,000 vertices** takes **0.51 seconds**.
+  - Retrieving information for **400,000 hyperedges** takes **2.52 seconds**.
+
+This example demonstrates the efficiency of Hypergraph-DB, even when working with large-scale hypergraphs. Below is a detailed table showing how the performance scales as the size of the hypergraph increases.
+
+**Detailed Performance Results**
+
+The following table shows the results of stress tests performed on Hypergraph-DB with varying scales. The tests measure the time taken to add vertices, add hyperedges, and query vertices and hyperedges.
+
+| **Number of Vertices** | **Number of Hyperedges** | **Add Vertices (s)** | **Add Edges (s)** | **Query Vertices (s/queries)** | **Query Edges (s/queries)** | **Total Time (s)** |
+|------------------------|--------------------------|----------------------|-------------------|--------------------------------|-----------------------------|--------------------|
+| 5,000                  | 1,000                    | 0.01                 | 0.01              | 0.00/2,000                     | 0.01/2,000                  | 0.02               |
+| 10,000                 | 2,000                    | 0.01                 | 0.01              | 0.00/4,000                     | 0.02/4,000                  | 0.05               |
+| 25,000                 | 5,000                    | 0.03                 | 0.04              | 0.01/10,000                    | 0.05/10,000                 | 0.13               |
+| 50,000                 | 10,000                   | 0.06                 | 0.07              | 0.02/20,000                    | 0.12/20,000                 | 0.26               |
+| 100,000                | 20,000                   | 0.12                 | 0.17              | 0.04/40,000                    | 0.24/40,000                 | 0.58               |
+| 250,000                | 50,000                   | 0.35                 | 0.40              | 0.11/100,000                   | 0.61/100,000                | 1.47               |
+| 500,000                | 100,000                  | 0.85                 | 1.07              | 0.22/200,000                   | 1.20/200,000                | 3.34               |
+| 1,000,000              | 200,000                  | 1.75                 | 1.82              | 0.51/400,000                   | 2.52/400,000                | 6.60               |
+
+---
+
+**Key Observations:**
+
+1. **Scalability**:
+   Hypergraph-DB scales efficiently with the number of vertices and hyperedges. The time to add vertices and hyperedges grows linearly with the size of the hypergraph.
+
+2. **Query Performance**:
+   Querying vertices and hyperedges remains fast, even for large-scale hypergraphs. For instance:
+   - Querying **200,000 vertices** takes only **0.22 seconds**.
+   - Querying **200,000 hyperedges** takes only **1.20 seconds**.
+
+3. **Total Time**:
+   The total time to construct and query a hypergraph with **1,000,000 vertices** and **200,000 hyperedges** is only **6.60 seconds**, showcasing the overall efficiency of Hypergraph-DB.
+
+This performance makes **Hypergraph-DB** a great choice for applications requiring fast and scalable hypergraph data management.
+
 ---
 
-## :sparkles: Features ##
+## :sparkles: Features
 
 :heavy_check_mark: **Flexible Hypergraph Representation**
 - Supports vertices (`v`) and hyperedges (`e`), where hyperedges can connect any number of vertices.
@@ -78,7 +124,7 @@ Hypergraph-DB provides a high-level abstraction for working with vertices and hy
 
 ---
 
-## :rocket: Installation ##
+## :rocket: Installation
 
 Hypergraph-DB is a Python library. You can install it directly from PyPI using `pip`.
 
@@ -100,7 +146,7 @@ pip install -r requirements.txt
 
 ---
 
-## :checkered_flag: Starting ##
+## :checkered_flag: Starting
 
 This section provides a quick guide to get started with Hypergraph-DB, including iusage, and running basic operations. Below is an example of how to use Hypergraph-DB, based on the provided test cases.
 
@@ -174,7 +220,7 @@ print(hg.nbr_v(1)) # Output: {3, 4}
 print(hg.nbr_e_of_v(1)) # Output: {(1, 3, 4)}
 ```
 
-#### **6. Persistence (Save and Load)**
+#### **6. Persistence (Save and Load)**
 
 ```python
 # Save the hypergraph to a file
@@ -190,14 +236,14 @@ print(hg2.all_e) # Output: {(1, 3, 4)}
 ```
 
 ---
 
-## :memo: License ##
+## :memo: License
 
 Hypergraph-DB is open-source and licensed under the [Apache License 2.0](LICENSE). Feel free to use, modify, and distribute it as per the license terms.
 
 ---
 
-## :email: Contact ##
+## :email: Contact
 
 Hypergraph-DB is maintained by [iMoon-Lab](http://moon-lab.tech/), Tsinghua University. If you have any questions, please feel free to contact us via email: [Yifan Feng](mailto:evanfeng97@gmail.com).
 
diff --git a/hyperdb/__init__.py b/hyperdb/__init__.py
index 74fadac..58a0752 100644
--- a/hyperdb/__init__.py
+++ b/hyperdb/__init__.py
@@ -3,6 +3,6 @@
 from ._global import AUTHOR_EMAIL
 
 
-__version__ = "0.1.0"
+__version__ = "0.1.1"
 
 __all__ = {"AUTHOR_EMAIL", "BaseHypergraphDB", "HypergraphDB"}
diff --git a/hyperdb/hypergraph.py b/hyperdb/hypergraph.py
index 3feda53..a12efab 100644
--- a/hyperdb/hypergraph.py
+++ b/hyperdb/hypergraph.py
@@ -112,7 +112,7 @@ def encode_e(self, e_tuple: Union[List, Set, Tuple]) -> Tuple:
         for v_id in tmp:
             assert isinstance(v_id, Hashable), "The vertex id must be hashable."
             assert (
-                v_id in self.all_v
+                v_id in self._v_data
             ), f"The vertex {v_id} does not exist in the hypergraph."
         return tuple(tmp)
 
@@ -157,7 +157,7 @@ def add_v(self, v_id: Any, v_data: Optional[Dict] = None):
             assert isinstance(v_data, dict), "The vertex data must be a dictionary."
         else:
             v_data = {}
-        if v_id not in self.all_v:
+        if v_id not in self._v_data:
             self._v_data[v_id] = v_data
             self._v_inci[v_id] = set()
         else:
@@ -180,7 +180,7 @@ def add_e(self, e_tuple: Union[List, Set, Tuple], e_data: Optional[Dict] = None)
         else:
             e_data = {}
         e_tuple = self.encode_e(e_tuple)
-        if e_tuple not in self.all_e:
+        if e_tuple not in self._e_data:
             self._e_data[e_tuple] = e_data
             for v in e_tuple:
                 self._v_inci[v].add(e_tuple)
@@ -197,7 +197,7 @@ def remove_v(self, v_id: Any):
         """
         assert isinstance(v_id, Hashable), "The vertex id must be hashable."
         assert (
-            v_id in self.all_v
+            v_id in self._v_data
         ), f"The vertex {v_id} does not exist in the hypergraph."
         del self._v_data[v_id]
         for e_tuple in self._v_inci[v_id]:
@@ -220,7 +220,7 @@ def remove_e(self, e_tuple: Union[List, Set, Tuple]):
         ), "The hyperedge must be a list, set, or tuple of vertex ids."
         e_tuple = self.encode_e(e_tuple)
         assert (
-            e_tuple in self.all_e
+            e_tuple in self._e_data
         ), f"The hyperedge {e_tuple} does not exist in the hypergraph."
         for v in e_tuple:
             self._v_inci[v].remove(e_tuple)
@@ -238,7 +238,7 @@ def update_v(self, v_id: Any, v_data: dict):
         assert isinstance(v_id, Hashable), "The vertex id must be hashable."
         assert isinstance(v_data, dict), "The vertex data must be a dictionary."
         assert (
-            v_id in self.all_v
+            v_id in self._v_data
         ), f"The vertex {v_id} does not exist in the hypergraph."
         self._v_data[v_id].update(v_data)
         self._clear_cache()
@@ -257,7 +257,7 @@ def update_e(self, e_tuple: Union[List, Set, Tuple], e_data: dict):
         assert isinstance(e_data, dict), "The hyperedge data must be a dictionary."
         e_tuple = self.encode_e(e_tuple)
         assert (
-            e_tuple in self.all_e
+            e_tuple in self._e_data
         ), f"The hyperedge {e_tuple} does not exist in the hypergraph."
         self._e_data[e_tuple].update(e_data)
         self._clear_cache()
@@ -270,7 +270,7 @@ def has_v(self, v_id: Any) -> bool:
             ``v_id`` (``Any``): The vertex id.
         """
         assert isinstance(v_id, Hashable), "The vertex id must be hashable."
-        return v_id in self.all_v
+        return v_id in self._v_data
 
     def has_e(self, e_tuple: Union[List, Set, Tuple]) -> bool:
         r"""
@@ -286,7 +286,7 @@ def has_e(self, e_tuple: Union[List, Set, Tuple]) -> bool:
             e_tuple = self.encode_e(e_tuple)
         except AssertionError:
             return False
-        return e_tuple in self.all_e
+        return e_tuple in self._e_data
 
     def degree_v(self, v_id: Any) -> int:
         r"""
@@ -297,7 +297,7 @@ def degree_v(self, v_id: Any) -> int:
         """
         assert isinstance(v_id, Hashable), "The vertex id must be hashable."
         assert (
-            v_id in self.all_v
+            v_id in self._v_data
         ), f"The vertex {v_id} does not exist in the hypergraph."
         return len(self._v_inci[v_id])
 
@@ -313,7 +313,7 @@ def degree_e(self, e_tuple: Union[List, Set, Tuple]) -> int:
         ), "The hyperedge must be a list, set, or tuple of vertex ids."
         e_tuple = self.encode_e(e_tuple)
         assert (
-            e_tuple in self.all_e
+            e_tuple in self._e_data
         ), f"The hyperedge {e_tuple} does not exist in the hypergraph."
         return len(e_tuple)
 
@@ -326,7 +326,7 @@ def nbr_e_of_v(self, v_id: Any) -> list:
         """
         assert isinstance(v_id, Hashable), "The vertex id must be hashable."
         assert (
-            v_id in self.all_v
+            v_id in self._v_data
         ), f"The vertex {v_id} does not exist in the hypergraph."
         return set(self._v_inci[v_id])
 
@@ -342,7 +342,7 @@ def nbr_v_of_e(self, e_tuple: Union[List, Set, Tuple]) -> list:
         ), "The hyperedge must be a list, set, or tuple of vertex ids."
         e_tuple = self.encode_e(e_tuple)
         assert (
-            e_tuple in self.all_e
+            e_tuple in self._e_data
         ), f"The hyperedge {e_tuple} does not exist in the hypergraph."
         return set(e_tuple)
 
@@ -355,7 +355,7 @@ def nbr_v(self, v_id: Any, exclude_self=True) -> list:
         """
         assert isinstance(v_id, Hashable), "The vertex id must be hashable."
         assert (
-            v_id in self.all_v
+            v_id in self._v_data
         ), f"The vertex {v_id} does not exist in the hypergraph."
         nbrs = set()
         for e_tuple in self._v_inci[v_id]:
diff --git a/performance/__init__.py b/performance/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/performance/stress_test.py b/performance/stress_test.py
new file mode 100644
index 0000000..51a2e0c
--- /dev/null
+++ b/performance/stress_test.py
@@ -0,0 +1,161 @@
+import sys
+from pathlib import Path
+
+sys.path.append(str(Path(__file__).parent.parent))
+
+import time
+import random
+import logging
+
+from hyperdb.hypergraph import HypergraphDB
+
+
+# Configure log directory and file
+log_root = Path(__file__).parent / "logs"
+if not log_root.exists():
+    log_root.mkdir()
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s - %(levelname)s - %(message)s",
+    filename=log_root / "stress_test.log",
+    filemode="w",
+)
+logger = logging.getLogger(__name__)
+
+
+def add_vertices(hg, num_vertices):
+    """Add multiple vertices to the hypergraph."""
+    start_time = time.time()
+    for i in range(1, num_vertices + 1):
+        hg.add_v(i, {"name": f"Vertex-{i}"})
+    end_time = time.time()
+    total_time = end_time - start_time
+    logger.info(f"Added {num_vertices} vertices in {total_time:.2f} seconds.")
+    return total_time
+
+
+def add_edges(hg, num_edges, max_vertices):
+    """Add multiple hyperedges to the hypergraph."""
+    start_time = time.time()
+    for _ in range(num_edges):
+        edge_size = random.randint(2, min(5, max_vertices))  # Each hyperedge contains 2 to 5 vertices
+        vertices = random.sample(range(1, max_vertices + 1), edge_size)
+        hg.add_e(tuple(vertices), {"relation": "random_edge"})
+    end_time = time.time()
+    total_time = end_time - start_time
+    logger.info(f"Added {num_edges} edges in {total_time:.2f} seconds.")
+    return total_time
+
+
+def query_vertices(hg, num_queries, max_vertices):
+    """Randomly query vertex data."""
+    start_time = time.time()
+    for _ in range(num_queries):
+        v_id = random.randint(1, max_vertices)
+        hg.v(v_id)
+    end_time = time.time()
+    total_time = end_time - start_time
+    logger.info(f"Queried {num_queries} vertices in {total_time:.2f} seconds.")
+    return total_time
+
+
+def query_edges(hg, num_queries, max_vertices):
+    """Randomly query hyperedge data."""
+    start_time = time.time()
+    for _ in range(num_queries):
+        edge_size = random.randint(2, min(5, max_vertices))
+        vertices = random.sample(range(1, max_vertices + 1), edge_size)
+        hg.e(tuple(vertices), default=None)
+    end_time = time.time()
+    total_time = end_time - start_time
+    logger.info(f"Queried {num_queries} edges in {total_time:.2f} seconds.")
+    return total_time
+
+
+def stress_test(num_vertices=5000, num_edges=1000, num_queries=2000, scale_factor=1):
+    """
+    Single-threaded stress test for HypergraphDB.
+
+    - Add vertices
+    - Add hyperedges
+    - Query vertices and hyperedges
+    """
+    hg = HypergraphDB()
+
+    # Adjust the number of vertices and edges based on the scale factor
+    vertices = num_vertices * scale_factor
+    edges = num_edges * scale_factor
+    queries = num_queries * scale_factor
+
+    logger.info(f"Starting stress test with scale factor {scale_factor}...")
+
+    # Step 1: Add vertices
+    vertex_add_time = add_vertices(hg, vertices)
+
+    # Step 2: Add hyperedges
+    edge_add_time = add_edges(hg, edges, vertices)
+
+    # Step 3: Query vertices
+    vertex_query_time = query_vertices(hg, queries, vertices)
+
+    # Step 4: Query hyperedges
+    edge_query_time = query_edges(hg, queries, vertices)
+
+    total_time = vertex_add_time + edge_add_time + vertex_query_time + edge_query_time
+
+    # Collect test results
+    result = {
+        "vertices_added": vertices,
+        "edges_added": edges,
+        "vertex_queries": queries,
+        "edge_queries": queries,
+        "vertex_add_time": vertex_add_time,
+        "edge_add_time": edge_add_time,
+        "vertex_query_time": vertex_query_time,
+        "edge_query_time": edge_query_time,
+        "total_time": total_time,
+    }
+
+    # Log test results
+    logger.info(f"Stress Test Completed for Scale Factor {scale_factor}:")
+    logger.info(f"  - Vertices added: {result['vertices_added']} in {result['vertex_add_time']:.2f} seconds")
+    logger.info(f"  - Edges added: {result['edges_added']} in {result['edge_add_time']:.2f} seconds")
+    logger.info(f"  - Vertex queries: {result['vertex_queries']} in {result['vertex_query_time']:.2f} seconds")
+    logger.info(f"  - Edge queries: {result['edge_queries']} in {result['edge_query_time']:.2f} seconds")
+    logger.info(f"  - Total Time: {result['total_time']:.2f} seconds")
+
+    # Return results for summary
+    return result
+
+
+def stress_increasing_scales_test():
+    """
+    Perform stress tests on hypergraphs with increasing scales.
+    """
+    scale_factors = [1, 2, 5, 10, 20, 50, 100, 200]  # Increasing scale factors
+    results = []
+
+    # Collect results for each scale factor
+    for scale in scale_factors:
+        logger.info(f"Starting stress test for scale factor: {scale}")
+        result = stress_test(num_vertices=5000, num_edges=1000, num_queries=2000, scale_factor=scale)
+        results.append(result)
+
+    # Output test results as a table
+    logger.info("\nSummary of Stress Test Results:\n")
+    logger.info(f"{'num v':<10}{'num e':<10}{'add v':<10}{'add e':<10}{'query v':<15}{'query e':<15}{'total time':<10}")
+    logger.info("-" * 80)
+    for result in results:
+        logger.info(
+            f"{result['vertices_added']:<10}"
+            f"{result['edges_added']:<10}"
+            f"{result['vertex_add_time']:<10.2f}"
+            f"{result['edge_add_time']:<10.2f}"
+            f"{result['vertex_query_time']:.2f}/{result['vertex_queries']:<10}"
+            f"{result['edge_query_time']:.2f}/{result['edge_queries']:<10}"
+            f"{result['total_time']:<10.2f}"
+        )
+
+
+if __name__ == "__main__":
+    stress_increasing_scales_test()
\ No newline at end of file
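
---

**Note on the optimization (illustrative, not part of the patch):** the functional change in `hyperdb/hypergraph.py` is to run existence checks against the underlying `_v_data` / `_e_data` dicts instead of the public `all_v` / `all_e` views. Assuming `all_v` and `all_e` are (cached) properties that materialize a set of ids and are invalidated by the `_clear_cache()` calls after every mutation, each `add_v` / `add_e` would otherwise pay an O(n) rebuild just to test membership, which makes bulk loads roughly quadratic, whereas a plain dict key lookup is O(1). The sketch below mirrors that assumed structure with a hypothetical `ToyStore` class so the difference is visible; it is not the HypergraphDB implementation.

```python
import time


class ToyStore:
    """Hypothetical stand-in that mimics the assumed all_v / _v_data layout."""

    def __init__(self, n):
        self._v_data = {i: {} for i in range(n)}

    @property
    def all_v(self):
        # Assumed behavior: materializes a fresh set of ids on each access (O(n)).
        return set(self._v_data.keys())


def bench(n=100_000, checks=1_000):
    store = ToyStore(n)

    start = time.time()
    for i in range(checks):
        _ = i in store.all_v  # rebuilds an n-element set per membership check
    via_property = time.time() - start

    start = time.time()
    for i in range(checks):
        _ = i in store._v_data  # O(1) dict key lookup
    via_dict = time.time() - start

    print(f"via all_v property: {via_property:.2f}s, via _v_data dict: {via_dict:.4f}s")


if __name__ == "__main__":
    bench()
```

The stress test added under `performance/` can be run directly with `python performance/stress_test.py`; it writes per-step timings and a summary table in the same format as the README results to `performance/logs/stress_test.log`.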