Add basic histogram rendering

kdouda · kdouda · commit 200329629274 · 2025-01-18T19:34:08.000+01:00
diff --git a/src/pyrdfrules/common/format/histogram.py b/src/pyrdfrules/common/format/histogram.py
@@ -0,0 +1,23 @@
+from typing import List, Tuple
+
+
+def draw_histogram(frequencies: dict[str, int], total: int|None = None) -> None:
+    """Print a rich table with one bar-chart row per entry of ``frequencies``.
+
+    Args:
+        frequencies: Mapping of row label to its count.
+        total: Count drawn as a full-width (40 character) bar; defaults to the largest count.
+    """
+    from rich import print
+    from rich.table import Table
+
+    # default=0 because max() raises ValueError on an empty mapping.
+    scale = max(frequencies.values(), default=0) if total is None else total
+
+    table = Table(title="Histogram")
+    table.add_column("Instances")
+    table.add_column("Name", justify="left")
+    table.add_column("Chart")
+    for key, freq in frequencies.items():
+        bar = "█" * int(40 * freq / scale) if scale else ""  # a zero scale cannot be divided by
+        table.add_row(str(freq), key, f"[bold magenta]{bar}[/]")
+    print(table)
diff --git a/src/pyrdfrules/common/result/histogram.py b/src/pyrdfrules/common/result/histogram.py
@@ -0,0 +1,118 @@
+from typing import List, Optional
+from pydantic import BaseModel, PositiveInt
+
+from pyrdfrules.common.result.resultobject import ResultObject
+
+class HistogramSingleResult(ResultObject):
+    """
+    Histogram single result: an amount together with the optional RDF
+    subject, predicate and object it refers to.
+    """
+
+    amount: PositiveInt
+    """Histogram amount.
+    """
+
+    object: Optional[str|dict] = None
+    """RDF object, if available.
+    """
+
+    predicate: Optional[str|dict] = None
+    """RDF predicate, if available.
+    """
+
+    subject: Optional[str|dict] = None
+    """RDF subject, if available.
+    """
+
+    def get_histogram_name(self) -> str:
+        """Returns the name of the histogram item.
+
+        The name joins the available subject, predicate and object (in that
+        order) with " - ".
+
+        Returns:
+            str: Name of the histogram item, or "Unknown" if all parts are missing.
+        """
+
+        # The fields are annotated as str or dict, and str.join only accepts
+        # strings, so every part is converted explicitly.
+        parts = [
+            str(part)
+            for part in (self.subject, self.predicate, self.object)
+            if part is not None
+        ]
+
+        if not parts:
+            return "Unknown"
+
+        return " - ".join(parts)
+
+
+class HistogramResult(BaseModel):
+    """
+    Histogram result collection.
+
+    Attributes:
+        list (List[HistogramSingleResult]): Histogram items.
+    """
+
+    list: List[HistogramSingleResult] = []
+
+    def get_sorted(self, reverse = True) -> List[HistogramSingleResult]:
+        """Returns the histogram list sorted by amount.
+
+        Args:
+            reverse (bool): If True, the list is sorted in descending order (most frequent first).
+
+        Returns:
+            list[HistogramSingleResult]: Sorted copy of the histogram list.
+        """
+
+        return sorted(self.list, key=lambda x: x.amount, reverse=reverse)
+
+    def get_top(self, n: int) -> List[HistogramSingleResult]:
+        """Returns the top n elements from the histogram.
+
+        Args:
+            n (int): Number of elements to return.
+
+        Returns:
+            list[HistogramSingleResult]: The n items with the highest amount, highest first.
+        """
+
+        return self.get_sorted()[:n]
+
+    def get_bottom(self, n: int) -> List[HistogramSingleResult]:
+        """Returns the bottom n elements from the histogram.
+
+        Args:
+            n (int): Number of elements to return.
+
+        Returns:
+            list[HistogramSingleResult]: The n items with the lowest amount, lowest first.
+        """
+
+        return self.get_sorted(reverse=False)[:n]
+
+    def print(self, top_n: int = 10):
+        """Prints the top_n most frequent items as a histogram table.
+
+        Nothing is printed when the histogram has no items.
+
+        Args:
+            top_n (int): Number of top elements to print.
+        """
+
+        from pyrdfrules.common.format.histogram import draw_histogram
+
+        mapping = {
+            item.get_histogram_name(): item.amount
+            for item in self.get_top(top_n)
+        }
+
+        # An empty histogram has no rows to draw.
+        if not mapping:
+            return
+
+        draw_histogram(mapping)
diff --git a/src/pyrdfrules/common/result/result.py b/src/pyrdfrules/common/result/result.py
@@ -2,10 +2,10 @@
 from typing import List
 from pyrdfrules.common.logging.logger import log
 from pyrdfrules.common.result.evaluation import Evaluation
+from pyrdfrules.common.result.histogram import HistogramResult, HistogramSingleResult
 from pyrdfrules.common.rule.resultrule import ResultRule
 from pyrdfrules.common.rule.ruleset import Ruleset
 
-
 class Result():
     """Class representing the result of a task.
     """
@@ -18,6 +18,8 @@ class Result():
     
     predictionTasks: List[dict] = None
     
+    histogram: HistogramResult = None
+    
     data: dict
     """Raw JSON response from RDFRules."""
     
@@ -40,6 +42,7 @@ def _parse_data(self):
         rules = []
         predictionTasks = []
         evaluate = []
+        histogram = []
         
         for item in self.data:
             log().debug(f"Parsing item: {str(item)}")
@@ -76,6 +79,11 @@ def _parse_data(self):
                     evaluate.append(Evaluation.model_validate(item))
                     pass
                 
+                case {'amount': _, 'subject': __, 'predicate': ___, 'object': ____}:
+                    # Item is a histogram
+                    histogram.append(HistogramSingleResult.model_validate(item))
+                    pass
+                
                 case _: 
                     log().debug(f"Unknown item: {str(item)}")
                     print("Unknown item")
@@ -84,6 +92,7 @@ def _parse_data(self):
         self.ruleset = Ruleset(rules = rules)
         self.predictionTasks = predictionTasks
         self.evaluate = evaluate
+        self.histogram = HistogramResult(list = histogram)
     
     def get_ruleset(self) -> Ruleset:
         """Returns the ruleset generated by RDFRules.
@@ -93,4 +102,9 @@ def get_ruleset(self) -> Ruleset:
     def get_evaluations(self) -> List[Evaluation]:
         """Returns the list of evaluations.
         """
-        return self.evaluate
+        return self.evaluate
+    
+    def get_histogram(self) -> HistogramResult:
+        """Returns the histogram collected by ``_parse_data`` (the class default is ``None``).
+        """
+        return self.histogram
diff --git a/src/pyrdfrules/rdfrules/pipeline.py b/src/pyrdfrules/rdfrules/pipeline.py
@@ -213,9 +213,9 @@ class Properties(RDFRulesTaskModel):
 
 class Histogram(RDFRulesTaskModel):
     name: Literal["Histogram"] = "Histogram"
-    subject: bool
-    predicate: bool
-    object: bool
+    subject: Optional[bool] = None  # presumably: build the histogram over subjects when True (TODO confirm)
+    predicate: Optional[bool] = None  # presumably: build the histogram over predicates when True (TODO confirm)
+    object: Optional[bool] = None  # presumably: build the histogram over objects when True (TODO confirm)
 
 
 class LoadIndex(RDFRulesTaskModel):
diff --git a/src/tests/test_histogram.py b/src/tests/test_histogram.py
@@ -0,0 +1,157 @@
+"""Example RDFRules pipeline JSON containing a Histogram task (reference only):
+[
+  {
+    "name": "LoadGraph",
+    "parameters": {
+      "path": "/data/wn18rr/train.tsv",
+      "settings": "tsvParsedUris"
+    }
+  },
+  {
+    "name": "Histogram",
+    "parameters": {
+      "subject": true,
+      "predicate": false,
+      "object": false
+    }
+  }
+]
+"""
+
+import time
+import unittest
+
+from pyrdfrules.common.http.url import Url
+
+import pyrdfrules.application
+from pyrdfrules.common.result.histogram import HistogramResult, HistogramSingleResult
+from pyrdfrules.common.task.task import Task
+from pyrdfrules.config import Config
+import os
+
+
+import os
+import time
+import unittest
+from unittest.mock import patch, MagicMock
+
+import requests
+import pyrdfrules
+from pyrdfrules.application import Application
+from pyrdfrules.common.result.result import Result
+from pyrdfrules.config import Config
+from pyrdfrules.config import Config
+from pyrdfrules.rdfrules.commondata import ConfidenceType, Constraint, RuleConsumer, RuleConsumerType, Threshold
+from pyrdfrules.rdfrules.jsonformats import PrefixFull
+from pyrdfrules.rdfrules.pipeline import ComputeConfidence, GetRules, GraphAwareRules, Histogram, Index, LoadGraph, MergeDatasets, AddPrefixes, Mine, Pipeline, SortRuleset
+
+def get_path(file_name):  # path of file_name inside the "data" directory next to this test module
+    return os.path.join(os.path.dirname((os.path.realpath(__file__))), "data", file_name)
+
+# slightly modified from
+# https://stackoverflow.com/questions/16694907/download-large-file-in-python-with-requests
+def download_file(url, file_name, base_path):
+    """Download ``url`` to ``<base_path>/dbpedia_yago/<file_name>`` unless that file already exists."""
+    target_dir = os.path.join(base_path, "dbpedia_yago")
+    file_path = os.path.join(target_dir, file_name)
+    os.makedirs(target_dir, exist_ok=True)
+
+    print(file_path)
+    if os.path.exists(file_path):
+        return
+
+    # Stream into a temporary name and rename at the end, so an interrupted
+    # download never leaves a truncated file that later runs would reuse.
+    partial_path = file_path + ".part"
+    with requests.get(url, stream=True, timeout=60) as r:  # timeout: do not hang on a stalled server
+        r.raise_for_status()
+        with open(partial_path, 'wb') as f:
+            for chunk in r.iter_content(chunk_size=8192):
+                f.write(chunk)
+    os.replace(partial_path, file_path)
+
+class TestHistogram(unittest.TestCase):
+    """Integration test: runs a Histogram task on a locally started RDFRules instance."""
+
+    def setUp(self):
+        """Downloads the sample dataset into the workspace and starts RDFRules locally."""
+        tests_dir = os.path.dirname(os.path.realpath(__file__))
+        self.config = Config(
+            workspace_path=os.path.realpath(os.path.join(tests_dir, "..", "rdfrules", "workspace"))
+        )
+
+        download_file("http://rdfrules.vse.cz/api/workspace/dbpedia_yago/mappingbased_objects_sample.ttl", "mappingbased_objects_sample.ttl", self.config.workspace_path)
+
+        self.instance = pyrdfrules.application.Application()
+
+        self.rdfrules = self.instance.start_local(
+            install_jvm = True,
+            install_rdfrules = True,
+            config = self.config
+        )
+
+        return super().setUp()
+
+    def tearDown(self):
+        """Stops the application started in setUp."""
+        self.instance.stop()
+        return super().tearDown()
+
+    def test_histogram(self):
+        """
+        Loads the sample graph, runs a subject histogram and checks the parsed result.
+        """
+
+        pipeline = Pipeline(
+            tasks=[
+                LoadGraph(
+                    graphName = "<dbpedia>",
+                    path = "/dbpedia_yago/mappingbased_objects_sample.ttl"
+                ),
+                Histogram(
+                    subject=True,
+                )
+            ]
+        )
+
+        task = self.rdfrules.task.create_task(pipeline)
+
+        for step in self.rdfrules.task.run_task(task):
+            print(step)
+            self.assertIsNotNone(step, "Should not be None")
+            self.assertIsInstance(step, Task, "Should be an instance of Task")
+
+        self.assertIsNotNone(task.result, "Should not be None")
+        self.assertTrue(task.finished, "Should be finished")
+
+        print(task.result)
+
+        histogram = task.get_result().get_histogram()
+
+        self.assertIsInstance(histogram, HistogramResult)
+        # assertGreater/assertEqual report the actual values on failure,
+        # unlike assertTrue on a precomputed boolean.
+        self.assertGreater(len(histogram.list), 0)
+
+        for item in histogram.list:
+            self.assertIsInstance(item, HistogramSingleResult)
+
+        top_ten_items = histogram.get_top(10)
+
+        # NOTE: assumes the sample graph yields at least ten histogram items.
+        self.assertEqual(len(top_ten_items), 10)
+
+        # get_top must return the items ordered from most to least frequent.
+        amounts = [item.amount for item in top_ten_items]
+        self.assertEqual(amounts, sorted(amounts, reverse=True))
+
+        print(top_ten_items)
+
+        histogram.print(top_n=10)
+
+
+# Entry point for running this test module directly. The guard used to be
+# duplicated; the second copy could never run because unittest.main()
+# exits the process once the tests have finished.
+if __name__ == '__main__':
+    unittest.main()