diff --git a/apps/docling_graph/assets/cytoscape-layout-forceatlas2.js b/apps/docling_graph/assets/cytoscape-layout-forceatlas2.js
new file mode 100644
index 0000000..9c2bdc5
--- /dev/null
+++ b/apps/docling_graph/assets/cytoscape-layout-forceatlas2.js
@@ -0,0 +1,60 @@
+/* Minimal ForceAtlas2-compatible layout wrapper for Cytoscape.
+ Uses COSE under the hood for stability when the real plugin bundle
+ is not available, while preserving the forceatlas2 layout name. */
+(function () {
+ if (typeof cytoscape === "undefined") {
+ return;
+ }
+
+ var defaults = {
+ iterations: 800,
+ scalingRatio: 1.0,
+ gravity: 1.0,
+ linLogMode: false,
+ preventOverlap: true,
+ fit: true,
+ padding: 30,
+ animate: false
+ };
+
+  function ForceAtlas2Layout(options) {
+    this.options = Object.assign({}, defaults, options || {});
+    this.cy = (options || {}).cy;
+  }
+
+ ForceAtlas2Layout.prototype.run = function () {
+ var opts = this.options;
+ if (!this.cy) {
+ return;
+ }
+
+ var scaling = Math.max(0.5, opts.scalingRatio || 1.0);
+ var layout = this.cy.layout({
+ name: "cose",
+ animate: opts.animate,
+ randomize: false,
+ fit: opts.fit,
+ padding: opts.padding,
+ gravity: opts.gravity,
+ nodeRepulsion: 2048 * scaling,
+ idealEdgeLength: 50 * scaling,
+ avoidOverlap: opts.preventOverlap,
+ numIter: opts.iterations
+ });
+
+ this._layout = layout;
+ layout.run();
+ };
+
+ ForceAtlas2Layout.prototype.stop = function () {
+ if (this._layout && typeof this._layout.stop === "function") {
+ this._layout.stop();
+ }
+ };
+
+ cytoscape("layout", "forceatlas2", ForceAtlas2Layout);
+
+ if (typeof window !== "undefined") {
+ window.cytoscapeLayoutForceatlas2 = ForceAtlas2Layout;
+ }
+})();
diff --git a/apps/docling_graph/assets/forceatlas2-init.js b/apps/docling_graph/assets/forceatlas2-init.js
new file mode 100644
index 0000000..307c4b2
--- /dev/null
+++ b/apps/docling_graph/assets/forceatlas2-init.js
@@ -0,0 +1,34 @@
+// Register ForceAtlas2 layout plugin if present.
+// Requires the plugin file at:
+// apps/docling_graph/assets/cytoscape-layout-forceatlas2.js
+
+(function () {
+ function tryRegister() {
+ var cy = window.cytoscape;
+ if (!cy) return false;
+
+ var fa2 = window.cytoscapeLayoutForceatlas2 || window.forceatlas2;
+
+    if (fa2 && typeof cy.use === "function" && !(fa2.prototype && fa2.prototype.run)) {
+      try {
+        cy.use(fa2);
+        console.log("[docling_graph] ForceAtlas2 plugin registered");
+        return true;
+      } catch (e) {
+        console.warn("[docling_graph] ForceAtlas2 plugin present but failed to register", e);
+      }
+    }
+    return !!(fa2 && fa2.prototype && typeof fa2.prototype.run === "function");
+ }
+
+  var attempts = 0;
+  var timer = setInterval(function () {
+    attempts += 1; var registered = tryRegister();
+    if (registered || attempts >= 20) {
+      clearInterval(timer);
+      if (!registered) {
+        console.warn("[docling_graph] ForceAtlas2 plugin not detected (layout will fail if selected)");
+      }
+    }
+  }, 250);
+})();
diff --git a/apps/docling_graph/assets/theme.css b/apps/docling_graph/assets/theme.css
index 157d9dc..2ff3f2f 100644
--- a/apps/docling_graph/assets/theme.css
+++ b/apps/docling_graph/assets/theme.css
@@ -1,38 +1,30 @@
-html, body {
- background: #121212;
- color: #eaeaea;
-}
-
-.app-root {
- display: flex;
- background: #121212;
-}
-
-.sidebar {
- width: 340px;
- background: #1b1b1b;
- padding: 12px;
- border-right: 1px solid #333;
-}
-
-label {
- color: #ccc;
-}
-
-.Select-control,
-.Select-menu-outer {
- background: #222;
- color: #fff;
+:root {
+ --gc-bg: #0b0f17;
+ --gc-panel-bg: #0f172a;
+ --gc-panel-border: #111827;
+ --gc-text: #e5e7eb;
+ --gc-muted: #94a3b8;
+ --gc-accent: #7c3aed;
+ --gc-graph-bg: #0b0f17;
+ --gc-button-bg: #111827;
+ --gc-button-border: #1f2937;
}
-.rc-slider-track {
- background-color: #9b4dff;
+@media (prefers-color-scheme: light) {
+ :root {
+ --gc-bg: #f8fafc;
+ --gc-panel-bg: #ffffff;
+ --gc-panel-border: #e2e8f0;
+ --gc-text: #0f172a;
+ --gc-muted: #475569;
+ --gc-accent: #7c3aed;
+ --gc-graph-bg: #ffffff;
+ --gc-button-bg: #f1f5f9;
+ --gc-button-border: #cbd5f5;
+ }
}
-.rc-slider-handle {
- border-color: #9b4dff;
-}
-
-.rc-slider-rail {
- background-color: #333;
+html, body {
+ background: var(--gc-bg);
+ color: var(--gc-text);
}
diff --git a/apps/docling_graph/assets/view_options.css b/apps/docling_graph/assets/view_options.css
index d068d5f..3bec418 100644
--- a/apps/docling_graph/assets/view_options.css
+++ b/apps/docling_graph/assets/view_options.css
@@ -1,83 +1,73 @@
-/* view_options.css
- Adds "skeleton-lite" grid to match dash-cytoscape demo layout
- while keeping your dark GraphCommons-style panel.
-*/
-
-/* ---------------------------
- Skeleton-lite grid (demo compatibility)
- The demo uses:
/ "four columns" / "row"
-----------------------------*/
-.row {
- display: flex;
- flex-wrap: wrap;
- align-items: stretch;
- width: 100%;
- gap: 0;
+/* Layout */
+.docling-app {
+ background: var(--gc-bg);
+ color: var(--gc-text);
+ min-height: 100vh;
+ padding: 12px;
}
-.columns {
- box-sizing: border-box;
- padding: 0;
- min-width: 0;
+.app-grid {
+ display: grid;
+ grid-template-columns: 320px 1fr 340px;
+ gap: 12px;
}
-/* 12-col grid: 8/12 and 4/12 */
-.eight.columns { flex: 0 0 66.6666%; max-width: 66.6666%; }
-.four.columns { flex: 0 0 33.3333%; max-width: 33.3333%; }
-
-/* Responsive: stack on small screens */
-@media (max-width: 980px) {
- .eight.columns, .four.columns { flex: 0 0 100%; max-width: 100%; }
+@media (max-width: 1200px) {
+ .app-grid {
+ grid-template-columns: 1fr;
+ }
}
-/* Add spacing so the right panel isn’t flush */
-.eight.columns { padding-right: 12px; }
-.four.columns { padding-left: 12px; }
+.panel-left,
+.panel-right {
+ background: var(--gc-panel-bg);
+ border: 1px solid var(--gc-panel-border);
+ border-radius: 12px;
+ padding: 14px;
+ box-shadow: 0 10px 20px rgba(0, 0, 0, 0.2);
+}
-/* ---------------------------
- Your existing dark UI
-----------------------------*/
-.app {
- background: #0b0f17;
- color: #e5e7eb;
- min-height: 100vh;
- padding: 10px;
+.panel-main {
+ background: var(--gc-panel-bg);
+ border: 1px solid var(--gc-panel-border);
+ border-radius: 12px;
+ padding: 8px;
}
-.topbar {
- display: flex;
- justify-content: space-between;
- align-items: center;
+.panel-title {
+ font-size: 16px;
+ font-weight: 700;
margin-bottom: 10px;
}
-.title {
- font-size: 22px;
- font-weight: 700;
+.control-section {
+ margin-top: 12px;
+ padding-top: 12px;
+ border-top: 1px solid var(--gc-panel-border);
}
-.controls {
- display: flex;
- gap: 14px;
- flex-wrap: wrap;
- align-items: center;
- margin-bottom: 10px;
+.control-label {
+ font-size: 12px;
+ letter-spacing: 0.3px;
+ text-transform: uppercase;
+ color: var(--gc-muted);
+ margin-bottom: 6px;
}
.btn {
- background: #111827;
- color: #e5e7eb;
- border: 1px solid #1f2937;
+ background: var(--gc-button-bg);
+ color: var(--gc-text);
+ border: 1px solid var(--gc-button-border);
padding: 8px 10px;
border-radius: 10px;
cursor: pointer;
+ margin-right: 6px;
+ margin-top: 6px;
}
-.btn:hover { border-color: #334155; }
-
.btn-primary {
- background: #7c3aed; /* purple accent to match demo vibe */
- color: white;
+ background: var(--gc-accent);
+ color: #fff;
border: 1px solid #6d28d9;
padding: 10px 12px;
border-radius: 10px;
@@ -85,204 +75,113 @@
width: 100%;
}
-.btn-close {
- background: transparent;
- border: 1px solid #1f2937;
- color: #e5e7eb;
- border-radius: 10px;
- padding: 4px 10px;
- cursor: pointer;
-}
-
.btn-small {
- background: #111827;
- color: #e5e7eb;
- border: 1px solid #1f2937;
+ background: var(--gc-button-bg);
+ color: var(--gc-text);
+ border: 1px solid var(--gc-button-border);
padding: 4px 10px;
border-radius: 8px;
cursor: pointer;
- margin-left: 6px;
-}
-
-.panel {
- position: fixed;
- top: 0;
- right: 0;
- width: 360px;
- height: 100vh;
- background: #0f172a;
- border-left: 1px solid #1f2937;
- box-shadow: -10px 0 40px rgba(0,0,0,0.35);
- padding: 14px 14px 18px;
- z-index: 9999;
- overflow-y: auto;
+ margin-top: 6px;
}
-.hidden { display: none; }
-
-.panel-header {
+.inspector-panel {
display: flex;
- justify-content: space-between;
- align-items: center;
- margin-bottom: 10px;
+ flex-direction: column;
+ gap: 10px;
}
-.panel-title {
+.inspector-title {
font-size: 18px;
font-weight: 700;
}
-.panel-section {
- border-top: 1px solid #1f2937;
- padding-top: 10px;
- margin-top: 10px;
-}
-
-.section-title {
- font-size: 12px;
- opacity: 0.85;
- margin-bottom: 6px;
- font-weight: 600;
-}
-
-.muted {
- opacity: 0.85;
+.inspector-subtitle {
font-size: 12px;
+ text-transform: uppercase;
+ color: var(--gc-muted);
}
-.panel-footer {
- margin-top: 14px;
- border-top: 1px solid #1f2937;
- padding-top: 12px;
-}
-
-/* ---------------------------
- Control panel (dark, inverted demo)
-----------------------------*/
-.control-tabs {
- background: #0b0f17;
- border-radius: 12px;
- box-shadow: inset 0 0 0 1px #111827, 0 10px 30px rgba(0, 0, 0, 0.35);
- overflow: hidden;
-}
-
-.control-tab {
- background: #0b0f17;
- color: #cbd5e1;
- border: none;
- padding: 0;
-}
-
-.control-tab--selected {
- background: #0f172a !important;
- border-bottom: 2px solid #7c3aed !important;
- color: #e5e7eb !important;
-}
-
-.control-panel {
- background: linear-gradient(135deg, #0f172a, #0b0f17);
- border: 1px solid #111827;
- border-radius: 12px;
- padding: 14px 14px 18px;
- color: #e5e7eb;
- min-height: 80vh;
- box-shadow: 0 14px 30px rgba(0, 0, 0, 0.45);
-}
-
-.control-panel--secondary {
- min-height: unset;
-}
-
-.control-panel__header {
- display: flex;
- align-items: center;
- justify-content: space-between;
- margin-bottom: 10px;
+.inspector-description {
+ font-size: 13px;
+ color: var(--gc-text);
+ white-space: pre-wrap;
}
-.control-title {
- font-size: 15px;
- font-weight: 700;
+.inspector-group {
+ border-top: 1px solid var(--gc-panel-border);
+ padding-top: 8px;
}
-.pill {
- border-radius: 999px;
- padding: 3px 10px;
- font-size: 11px;
- letter-spacing: 0.3px;
- text-transform: uppercase;
- background: #111827;
- border: 1px solid #1f2937;
- color: #cbd5e1;
+.inspector-group-title {
+ font-size: 12px;
+ color: var(--gc-muted);
+ margin-bottom: 4px;
}
-.pill--invert {
- background: #c7d2fe;
- color: #0b0f17;
- border-color: #a5b4fc;
+.inspector-item {
+ font-size: 12px;
+ padding: 2px 0;
}
-.control-section {
+.debug-panel {
margin-top: 12px;
- padding-top: 12px;
- border-top: 1px solid #111827;
-}
-
-.control-section--tight {
- padding-top: 6px;
+ background: var(--gc-panel-bg);
+ border: 1px solid var(--gc-panel-border);
+ border-radius: 12px;
+ padding: 12px;
}
-.control-label {
- font-size: 12px;
- letter-spacing: 0.3px;
- text-transform: uppercase;
- color: #94a3b8;
- margin-bottom: 6px;
+.debug-output {
+ max-height: 200px;
+ overflow-y: auto;
+ background: rgba(15, 23, 42, 0.4);
+ padding: 8px;
+ border-radius: 8px;
+ color: var(--gc-text);
}
-.control-subtext {
- margin-top: 8px;
+.muted {
+ color: var(--gc-muted);
font-size: 12px;
- color: #cbd5e1;
- opacity: 0.85;
}
.Select-control,
.Select-menu-outer {
- background: #0f172a;
- border-color: #1f2937;
- color: #e5e7eb;
+ background: var(--gc-panel-bg);
+ border-color: var(--gc-panel-border);
+ color: var(--gc-text);
}
-.control-panel .Select-value-label {
- color: #e5e7eb !important;
+.control-panel .Select-value-label,
+.panel-left .Select-value-label {
+ color: var(--gc-text) !important;
}
-.control-panel .rc-slider-track {
- background-color: #7c3aed;
+.control-panel .rc-slider-track,
+.panel-left .rc-slider-track {
+ background-color: var(--gc-accent);
}
-.control-panel .rc-slider-handle {
- border-color: #7c3aed;
- background-color: #0f172a;
+.control-panel .rc-slider-handle,
+.panel-left .rc-slider-handle {
+ border-color: var(--gc-accent);
+ background-color: var(--gc-panel-bg);
box-shadow: 0 0 0 2px rgba(124, 58, 237, 0.2);
}
-.control-panel .rc-slider-rail {
- background-color: #1f2937;
-}
-
-.control-panel .rc-slider-mark-text {
- color: #94a3b8;
+.control-panel .rc-slider-rail,
+.panel-left .rc-slider-rail {
+ background-color: var(--gc-panel-border);
}
.control-radio,
.control-checkbox {
- accent-color: #7c3aed;
+ accent-color: var(--gc-accent);
margin-right: 8px;
}
.control-radio__label,
.control-checkbox__label {
- color: #e5e7eb;
+ color: var(--gc-text);
}
diff --git a/apps/docling_graph/graph_builder.py b/apps/docling_graph/graph_builder.py
index 502eff0..8c03f12 100644
--- a/apps/docling_graph/graph_builder.py
+++ b/apps/docling_graph/graph_builder.py
@@ -1,14 +1,14 @@
from __future__ import annotations
-import hashlib
import json
+import math
import os
import tempfile
import unittest
-from collections.abc import Iterable
+from collections import defaultdict
from dataclasses import dataclass
from pathlib import Path
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Dict, Iterable, List, Optional, Tuple
DOCLING_JSON_ROOT = "/home/hp/docling-ws/data/docling"
@@ -28,6 +28,16 @@
"decorative",
}
+EDGE_CONTAINS = "CONTAINS"
+EDGE_HAS_PAGE = "HAS_PAGE"
+EDGE_HAS_BODY = "HAS_BODY"
+EDGE_NEXT = "NEXT"
+EDGE_ON_PAGE = "ON_PAGE"
+
+NODE_DOCUMENT = "Document"
+NODE_PAGE = "Page"
+NODE_TEXT = "Text"
+
# -----------------------------
# Graph contract
@@ -41,8 +51,19 @@ class GraphPayload:
# -----------------------------
# Helpers
# -----------------------------
-def _nid(value: str) -> str:
- return "n_" + hashlib.md5(value.encode("utf-8")).hexdigest()[:12]
+
+def _node_id(node_type: str, name: str) -> str:
+ return f"{node_type}::{name}"
+
+
+def _edge_id(
+ from_type: str,
+ from_name: str,
+ edge_type: str,
+ to_type: str,
+ to_name: str,
+) -> str:
+ return f"{from_type}::{from_name}::{edge_type}::{to_type}::{to_name}"
def _load_json(path: str) -> Dict[str, Any]:
@@ -64,17 +85,6 @@ def _short(text: str, limit: int = 120) -> str:
return t[:limit] + ("…" if len(t) > limit else "")
-def list_docling_files(json_root: Optional[str] = None) -> List[str]:
- results: List[str] = []
- search_root = json_root or DOCLING_JSON_ROOT
-
- for root, _, files in os.walk(search_root):
- for name in files:
- if name.lower().endswith(".json"):
- results.append(os.path.join(root, name))
- return sorted(results)
-
-
def _is_noise_text(label: str, text: str) -> bool:
lbl = (label or "").strip().lower()
t = (text or "").strip()
@@ -88,9 +98,91 @@ def _is_noise_text(label: str, text: str) -> bool:
return False
+def _clamp(value: float, min_value: float, max_value: float) -> float:
+ return max(min_value, min(value, max_value))
+
+
+def _resolve_pointer(doc: Any, pointer: str) -> Any:
+ current = doc
+ for part in pointer.split("/"):
+ part = part.replace("~1", "/").replace("~0", "~")
+ if isinstance(current, list):
+ try:
+ idx = int(part)
+ except ValueError:
+ return None
+            if idx < 0 or idx >= len(current):
+ return None
+ current = current[idx]
+ elif isinstance(current, dict):
+ if part not in current:
+ return None
+ current = current[part]
+ else:
+ return None
+ return current
+
+
+def resolve_refs(obj: Any, root: Any, seen: Optional[Dict[int, Any]] = None) -> Any:
+ if seen is None:
+ seen = {}
+
+ obj_id = id(obj)
+ if obj_id in seen:
+ return seen[obj_id]
+
+ if isinstance(obj, dict):
+ if "$ref" in obj and isinstance(obj["$ref"], str):
+ ref = obj["$ref"]
+ if ref.startswith("#/"):
+ resolved = _resolve_pointer(root, ref[2:])
+ if resolved is not None:
+ resolved_value = resolve_refs(resolved, root, seen)
+ if len(obj) == 1:
+ return resolved_value
+ merged = {
+ **(resolved_value if isinstance(resolved_value, dict) else {}),
+ **{k: v for k, v in obj.items() if k != "$ref"},
+ }
+ return resolve_refs(merged, root, seen)
+ resolved_dict = {}
+ seen[obj_id] = resolved_dict
+ for key, value in obj.items():
+ resolved_dict[key] = resolve_refs(value, root, seen)
+ return resolved_dict
+
+ if isinstance(obj, list):
+ resolved_list: List[Any] = []
+ seen[obj_id] = resolved_list
+ for item in obj:
+ resolved_list.append(resolve_refs(item, root, seen))
+ return resolved_list
+
+ return obj
+
+
+def list_docling_files(json_root: Optional[str] = None) -> List[str]:
+ results: List[str] = []
+ search_root = json_root or DOCLING_JSON_ROOT
+
+ for root, _, files in os.walk(search_root):
+ for name in files:
+ if name.lower().endswith(".json"):
+ results.append(os.path.join(root, name))
+ return sorted(results)
+
+
# -----------------------------
# Core builder
# -----------------------------
+
+def _collect_text_items(doc: Dict[str, Any]) -> List[Dict[str, Any]]:
+ texts = doc.get("texts")
+ if isinstance(texts, list):
+ return [t for t in texts if isinstance(t, dict)]
+ return []
+
+
def _collect_pages_from_texts(texts: Iterable[Any]) -> Dict[int, List[Dict[str, Any]]]:
pages: Dict[int, List[Dict[str, Any]]] = {}
@@ -116,112 +208,208 @@ def _collect_pages_from_texts(texts: Iterable[Any]) -> Dict[int, List[Dict[str,
return pages
-def build_graph_from_docling_json(path: str) -> GraphPayload:
- """
- Graph shape:
- DOCUMENT → PAGE → TEXT
+def _bucket_text_length(text: str) -> int:
+ if not text:
+ return 1
+ return int(_clamp(math.ceil(len(text) / 200), 1, 10))
+
+
+def _compute_node_weights(nodes: List[Dict[str, Any]], edges: List[Dict[str, Any]]) -> None:
+ outgoing = defaultdict(list)
+ incoming = defaultdict(list)
+
+ for edge in edges:
+ data = edge.get("data", {})
+ source = data.get("source")
+ target = data.get("target")
+ if source:
+ outgoing[source].append(edge)
+ if target:
+ incoming[target].append(edge)
+
+ for node in nodes:
+ data = node.get("data", {})
+ node_id = data.get("id")
+ description = data.get("description", "") or ""
+ text_score = min(10, math.ceil(len(description) / 200)) if description else 0
+ children_score = len(outgoing.get(node_id, []))
+ degree_score = len(outgoing.get(node_id, [])) + len(incoming.get(node_id, []))
+ weight = 1 + children_score + text_score + (degree_score * 0.5)
+ data["weight"] = round(weight, 2)
+ data["size"] = round(_clamp(20 + weight * 3, 20, 80), 2)
+
+
+def _compute_edge_weights(edges: List[Dict[str, Any]], node_lookup: Dict[str, Dict[str, Any]]) -> None:
+ for edge in edges:
+ data = edge.get("data", {})
+ edge_type = data.get("type")
+ weight = 1
+ if edge_type == EDGE_CONTAINS:
+ target = node_lookup.get(data.get("target"), {})
+ description = target.get("data", {}).get("description", "")
+ weight = _bucket_text_length(description)
+ data["weight"] = weight
+ data["width"] = round(_clamp(1 + weight * 0.7, 1, 8), 2)
+
+
+def _normalize_doc_root(doc: Dict[str, Any]) -> Dict[str, Any]:
+ if isinstance(doc, dict):
+ for key in ("document", "doc"):
+ candidate = doc.get(key)
+ if isinstance(candidate, dict):
+ return candidate
+ return doc
+
+
+def _build_graph(doc: Dict[str, Any], doc_name: str) -> GraphPayload:
+ resolved_doc = resolve_refs(_normalize_doc_root(doc), doc)
+
+ nodes_by_id: Dict[str, Dict[str, Any]] = {}
+ edges_by_id: Dict[str, Dict[str, Any]] = {}
+
+ doc_name = doc_name or resolved_doc.get("title") or "Docling Document"
+ doc_node_id = _node_id(NODE_DOCUMENT, doc_name)
+ nodes_by_id[doc_node_id] = {
+ "data": {
+ "id": doc_node_id,
+ "label": doc_name,
+ "type": NODE_DOCUMENT,
+ "description": resolved_doc.get("title") or doc_name,
+ "weight": 1,
+ "size": 20,
+ },
+ "classes": "node-type-document",
+ }
+
+ texts = _collect_text_items(resolved_doc)
+ pages = _collect_pages_from_texts(texts)
+
+ page_ids: Dict[int, str] = {}
- Nodes and edges are returned separately
- for Cytoscape stability and incremental expansion.
- """
- doc = _load_json(path)
-
- nodes: List[Dict[str, Any]] = []
- edges: List[Dict[str, Any]] = []
-
- doc_id = _nid(path)
- doc_name = os.path.basename(path)
+ for page_no in sorted(pages.keys()):
+ page_name = f"Page {page_no}"
+ page_id = _node_id(NODE_PAGE, page_name)
+ page_ids[page_no] = page_id
+ nodes_by_id[page_id] = {
+ "data": {
+ "id": page_id,
+ "label": page_name,
+ "type": NODE_PAGE,
+ "description": f"Page {page_no} of {doc_name}",
+ "weight": 1,
+ "size": 20,
+ "page": page_no,
+ },
+ "classes": "node-type-page",
+ }
- # DOCUMENT node
- nodes.append(
- {
+ edge_id = _edge_id(NODE_DOCUMENT, doc_name, EDGE_HAS_PAGE, NODE_PAGE, page_name)
+ edges_by_id[edge_id] = {
"data": {
- "id": doc_id,
- "type": "document",
- "label_short": f"DOCUMENT: {doc_name}",
- "label_full": f"DOCUMENT: {doc_name}",
- "label": f"DOCUMENT: {doc_name}",
+ "id": edge_id,
+ "source": doc_node_id,
+ "target": page_id,
+ "type": EDGE_HAS_PAGE,
},
- "classes": "document",
+ "classes": "edge-type-has-page",
}
- )
- pages = _collect_pages_from_texts(doc.get("texts"))
+ sorted_pages = sorted(page_ids.items())
+ for index, (page_no, page_id) in enumerate(sorted_pages[:-1]):
+ next_page_no, next_page_id = sorted_pages[index + 1]
+ from_name = f"Page {page_no}"
+ to_name = f"Page {next_page_no}"
+ edge_id = _edge_id(NODE_PAGE, from_name, EDGE_NEXT, NODE_PAGE, to_name)
+ edges_by_id[edge_id] = {
+ "data": {
+ "id": edge_id,
+ "source": page_id,
+ "target": next_page_id,
+ "type": EDGE_NEXT,
+ },
+ "classes": "edge-type-next",
+ }
- # PAGE + TEXT nodes
for page_no in sorted(pages.keys()):
- page_id = _nid(f"{path}::page::{page_no}")
+ page_id = page_ids[page_no]
+ page_name = f"Page {page_no}"
+ page_texts = pages[page_no][:MAX_TEXTS_PER_PAGE]
+
+ for idx, t in enumerate(page_texts, start=1):
+ raw_text = str(t.get("text") or "")
+ label = str(t.get("label") or "text").strip() or "text"
+ name = f"p{page_no}-{idx}: {_short(raw_text, 80)}"
+ text_id = _node_id(NODE_TEXT, name)
- nodes.append(
- {
+ nodes_by_id[text_id] = {
"data": {
- "id": page_id,
- "type": "chunk",
+ "id": text_id,
+ "label": name,
+ "type": NODE_TEXT,
+ "description": raw_text,
+ "weight": 1,
+ "size": 20,
"page": page_no,
- "label_short": f"PAGE {page_no}",
- "label_full": f"PAGE {page_no}",
- "label": f"PAGE {page_no}",
+ "label_type": label,
},
- "classes": "section",
+ "classes": "node-type-text",
}
- )
- edges.append(
- {
+ contains_id = _edge_id(NODE_PAGE, page_name, EDGE_CONTAINS, NODE_TEXT, name)
+ edges_by_id[contains_id] = {
"data": {
- "id": _nid(f"{doc_id}__{page_id}"),
- "source": doc_id,
+ "id": contains_id,
+ "source": page_id,
+ "target": text_id,
+ "type": EDGE_CONTAINS,
+ },
+ "classes": "edge-type-contains",
+ }
+
+ on_page_id = _edge_id(NODE_TEXT, name, EDGE_ON_PAGE, NODE_PAGE, page_name)
+ edges_by_id[on_page_id] = {
+ "data": {
+ "id": on_page_id,
+ "source": text_id,
"target": page_id,
- "rel": "hier",
+ "type": EDGE_ON_PAGE,
},
- "classes": "hier",
+ "classes": "edge-type-on-page",
}
- )
- page_texts = pages[page_no][:MAX_TEXTS_PER_PAGE]
+ has_body_id = _edge_id(NODE_DOCUMENT, doc_name, EDGE_HAS_BODY, NODE_TEXT, name)
+ edges_by_id[has_body_id] = {
+ "data": {
+ "id": has_body_id,
+ "source": doc_node_id,
+ "target": text_id,
+ "type": EDGE_HAS_BODY,
+ },
+ "classes": "edge-type-has-body",
+ }
- for t in page_texts:
- ref = t.get("self_ref") or t.get("id") or repr(t)
- text_id = _nid(ref)
+ nodes = list(nodes_by_id.values())
+ edges = list(edges_by_id.values())
+ node_lookup = {node["data"]["id"]: node for node in nodes}
- raw_text = str(t.get("text") or "")
- dtype = str(t.get("label") or "text").upper()
- content_layer = t.get("content_layer")
- _, bbox = _first_prov(t)
+ _compute_edge_weights(edges, node_lookup)
+ _compute_node_weights(nodes, edges)
- label_full = f"{dtype}: {raw_text}"
- label_short = f"{dtype}: {_short(raw_text, 140)}"
+ nodes = sorted(nodes, key=lambda n: n.get("data", {}).get("id", ""))
+ edges = sorted(edges, key=lambda e: e.get("data", {}).get("id", ""))
- nodes.append(
- {
- "data": {
- "id": text_id,
- "type": "text",
- "content_layer": content_layer,
- "text": raw_text,
- "page": page_no,
- "bbox": bbox,
- "label_short": label_short,
- "label_full": label_full,
- "label": label_full,
- },
- "classes": "item",
- }
- )
+ return GraphPayload(nodes=nodes, edges=edges)
- edges.append(
- {
- "data": {
- "id": _nid(f"{page_id}__{text_id}"),
- "source": page_id,
- "target": text_id,
- "rel": "hier",
- },
- "classes": "hier",
- }
- )
- return GraphPayload(nodes=nodes, edges=edges)
+def build_graph_from_docling_json(path: str) -> GraphPayload:
+ doc = _load_json(path)
+ doc_name = os.path.basename(path)
+ return _build_graph(doc, doc_name)
+
+
+def build_elements_from_docling_json(doc: Dict[str, Any], doc_name: str | None = None) -> List[Dict[str, Any]]:
+    graph = _build_graph(doc, doc_name or "")
+ return graph.nodes + graph.edges
# -----------------------------
@@ -252,13 +440,14 @@ def test_collect_pages_from_texts_filters_invalid_entries(self):
def test_build_graph_from_docling_json_emits_document_page_and_text(self):
doc = {
+ "title": "Example",
"texts": [
{
"label": "body",
"text": "paragraph " * 10,
"prov": [{"page_no": 1, "bbox": [0, 0, 10, 10]}],
}
- ]
+ ],
}
with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as tmp:
@@ -270,19 +459,16 @@ def test_build_graph_from_docling_json_emits_document_page_and_text(self):
finally:
Path(tmp_path).unlink(missing_ok=True)
- self.assertEqual(len(payload.nodes), 3)
- self.assertEqual(len(payload.edges), 2)
-
- node_ids = {n["data"]["id"] for n in payload.nodes}
- edge_pairs = {(e["data"]["source"], e["data"]["target"]) for e in payload.edges}
+ node_types = {n["data"]["type"] for n in payload.nodes}
+ edge_types = {e["data"]["type"] for e in payload.edges}
- doc_id = _nid(tmp_path)
- page_id = _nid(f"{tmp_path}::page::1")
- text_id = next(n["data"]["id"] for n in payload.nodes if n["data"]["type"] == "text")
-
- self.assertIn(doc_id, node_ids)
- self.assertIn(page_id, node_ids)
- self.assertEqual(edge_pairs, {(doc_id, page_id), (page_id, text_id)})
+ self.assertIn(NODE_DOCUMENT, node_types)
+ self.assertIn(NODE_PAGE, node_types)
+ self.assertIn(NODE_TEXT, node_types)
+ self.assertIn(EDGE_CONTAINS, edge_types)
+ self.assertIn(EDGE_HAS_PAGE, edge_types)
+ self.assertIn(EDGE_HAS_BODY, edge_types)
+ self.assertIn(EDGE_ON_PAGE, edge_types)
def test_list_docling_files_accepts_custom_root(self):
with tempfile.TemporaryDirectory() as tmpdir:
@@ -298,6 +484,11 @@ def test_list_docling_files_accepts_custom_root(self):
self.assertEqual(results, sorted([str(first), str(second)]))
+ def test_resolve_refs_inlines_json_pointer(self):
+ doc = {"texts": [{"label": "body", "text": "abc"}], "ref": {"$ref": "#/texts/0"}}
+ resolved = resolve_refs(doc, doc)
+ self.assertEqual(resolved["ref"], {"label": "body", "text": "abc"})
+
if __name__ == "__main__":
unittest.main()
diff --git a/apps/docling_graph/graph_styles.py b/apps/docling_graph/graph_styles.py
new file mode 100644
index 0000000..e079101
--- /dev/null
+++ b/apps/docling_graph/graph_styles.py
@@ -0,0 +1,166 @@
+from __future__ import annotations
+
+import hashlib
+from typing import Any, Dict, Iterable, List, Tuple
+
+
+NODE_PALETTE = [
+ "#38BDF8",
+ "#A78BFA",
+ "#F472B6",
+ "#34D399",
+ "#FBBF24",
+ "#60A5FA",
+ "#FB7185",
+ "#4ADE80",
+ "#F59E0B",
+ "#22D3EE",
+]
+
+EDGE_PALETTE = [
+ "#94A3B8",
+ "#A78BFA",
+ "#F472B6",
+ "#34D399",
+ "#FBBF24",
+ "#60A5FA",
+]
+
+THEMES = {
+ "dark": {
+ "bg": "#0B0F17",
+ "panel": "#0F172A",
+ "text": "#E5E7EB",
+ "muted": "#94A3B8",
+ "outline": "#0B1220",
+ "edge": "#64748B",
+ "edge_label": "#CBD5E1",
+ "selection": "#FBBF24",
+ "dim": 0.15,
+ },
+ "light": {
+ "bg": "#F8FAFC",
+ "panel": "#FFFFFF",
+ "text": "#0F172A",
+ "muted": "#475569",
+ "outline": "#E2E8F0",
+ "edge": "#64748B",
+ "edge_label": "#334155",
+ "selection": "#F59E0B",
+ "dim": 0.2,
+ },
+}
+
+
+def _slug(value: str) -> str:
+ return "".join(ch if ch.isalnum() else "-" for ch in value.strip().lower()).strip("-")
+
+
+def _color_for_type(type_name: str, palette: List[str]) -> str:
+ digest = hashlib.md5(type_name.encode("utf-8")).hexdigest()
+ index = int(digest[:8], 16) % len(palette)
+ return palette[index]
+
+
+def apply_theme_to_elements(
+ nodes: Iterable[Dict[str, Any]],
+ edges: Iterable[Dict[str, Any]],
+ theme: str,
+) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
+ themed_nodes = []
+ themed_edges = []
+ node_color_cache: Dict[str, str] = {}
+ edge_color_cache: Dict[str, str] = {}
+
+ for node in nodes:
+ data = node.get("data", {})
+ node_type = data.get("type", "")
+ if node_type not in node_color_cache:
+ node_color_cache[node_type] = _color_for_type(node_type, NODE_PALETTE)
+ color = node_color_cache[node_type]
+ themed_node = {**node, "data": {**data, "color": color}}
+ themed_node["classes"] = f"node-type-{_slug(node_type)}"
+ themed_nodes.append(themed_node)
+
+ for edge in edges:
+ data = edge.get("data", {})
+ edge_type = data.get("type", "")
+ if edge_type not in edge_color_cache:
+ edge_color_cache[edge_type] = _color_for_type(edge_type, EDGE_PALETTE)
+ color = edge_color_cache[edge_type]
+ themed_edge = {**edge, "data": {**data, "color": color}}
+ themed_edge["classes"] = f"edge-type-{_slug(edge_type)}"
+ themed_edges.append(themed_edge)
+
+ return themed_nodes, themed_edges
+
+
+def base_stylesheet(
+ theme: str,
+ scale_node_size: bool,
+ scale_edge_width: bool,
+ show_edge_labels: bool,
+ show_arrows: bool,
+) -> List[Dict[str, Any]]:
+ tokens = THEMES.get(theme or "", THEMES["dark"])
+
+ node_size = "data(size)" if scale_node_size else "32px"
+ edge_width = "data(width)" if scale_edge_width else "2px"
+ arrow_shape = "triangle" if show_arrows else "none"
+ edge_label = "data(type)" if show_edge_labels else ""
+
+ return [
+ {
+ "selector": "node",
+ "style": {
+ "label": "data(label)",
+ "font-size": "10px",
+ "text-wrap": "wrap",
+ "text-max-width": "480px",
+ "color": tokens["text"],
+ "text-outline-width": 1,
+ "text-outline-color": tokens["outline"],
+ "width": node_size,
+ "height": node_size,
+ "background-color": "data(color)",
+ "z-index": 9999,
+ },
+ },
+ {
+ "selector": "edge",
+ "style": {
+ "curve-style": "bezier",
+ "line-color": "data(color)",
+ "target-arrow-color": "data(color)",
+ "target-arrow-shape": arrow_shape,
+ "arrow-scale": 0.8,
+ "opacity": 0.65,
+ "label": edge_label,
+ "font-size": "9px",
+ "color": tokens["edge_label"],
+ "width": edge_width,
+ "z-index": 5000,
+ },
+ },
+ {
+ "selector": ":selected",
+ "style": {
+ "border-width": 3,
+ "border-color": tokens["selection"],
+ },
+ },
+ {
+ "selector": ".highlight",
+ "style": {
+ "border-width": 3,
+ "border-color": tokens["selection"],
+ "opacity": 1,
+ },
+ },
+ {
+ "selector": ".dimmed",
+ "style": {
+ "opacity": tokens["dim"],
+ },
+ },
+ ]
diff --git a/apps/docling_graph/main.py b/apps/docling_graph/main.py
index b3dcfce..23c7fa7 100644
--- a/apps/docling_graph/main.py
+++ b/apps/docling_graph/main.py
@@ -1,18 +1,22 @@
from __future__ import annotations
-import os
import json
-from dash import Dash, html, dcc, Input, Output, State, no_update
+import logging
+import os
+import unittest
+from typing import Any, Dict, Iterable, List, Tuple
+
+from dash import ALL, Dash, Input, Output, State, dcc, html, no_update, ctx
import dash_cytoscape as cyto
-from .graph_builder import (
- build_graph_from_docling_json,
- list_docling_files,
- GraphPayload,
-)
+from .graph_builder import build_graph_from_docling_json, list_docling_files
+from .graph_styles import apply_theme_to_elements, base_stylesheet
APP_DIR = os.path.dirname(os.path.abspath(__file__))
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
# Enable stable extra layouts
try:
cyto.load_extra_layouts()
@@ -23,26 +27,333 @@
# -------------------------------------------------
# Helpers
# -------------------------------------------------
-def to_cytoscape_elements(graph: GraphPayload):
- return graph.nodes + graph.edges
+def safe_base_stylesheet(
+ theme: str,
+ scale_node_size: bool,
+ scale_edge_width: bool,
+ show_edge_labels: bool,
+ show_arrows: bool,
+) -> List[Dict[str, Any]]:
+ try:
+ return base_stylesheet(
+ theme,
+ scale_node_size,
+ scale_edge_width,
+ show_edge_labels,
+ show_arrows,
+ )
+ except Exception:
+ logger.exception("Failed to build stylesheet")
+ return []
+
+
+def _filter_graph(
+ graph: Dict[str, Any],
+ node_types: Iterable[str],
+ edge_types: Iterable[str],
+ hide_page_nodes: bool,
+ hide_isolated_nodes: bool,
+ min_node_weight: float,
+ min_edge_weight: float,
+ keep_context_nodes: bool,
+) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
+ nodes = graph.get("nodes", []) if graph else []
+ edges = graph.get("edges", []) if graph else []
+
+ node_type_filter = {t for t in (node_types or [])}
+ edge_type_filter = {t for t in (edge_types or [])}
+
+ def node_passes(node: Dict[str, Any]) -> bool:
+ data = node.get("data", {})
+ node_type = data.get("type")
+ weight = data.get("weight", 0)
+ if hide_page_nodes and node_type == "Page":
+ return False
+ if node_type_filter and node_type not in node_type_filter:
+ return False
+ return weight >= (min_node_weight or 0)
+
+ def edge_passes(edge: Dict[str, Any]) -> bool:
+ data = edge.get("data", {})
+ edge_type = data.get("type")
+ weight = data.get("weight", 0)
+ if edge_type_filter and edge_type not in edge_type_filter:
+ return False
+ return weight >= (min_edge_weight or 0)
+
+ candidate_nodes = [node for node in nodes if node_passes(node)]
+ candidate_edges = [edge for edge in edges if edge_passes(edge)]
+
+ nodes_by_id = {node["data"]["id"]: node for node in nodes}
+ filtered_node_ids = {node["data"]["id"] for node in candidate_nodes}
+
+ edge_node_ids = {
+ node_id
+ for edge in candidate_edges
+ for node_id in (edge["data"].get("source"), edge["data"].get("target"))
+ if node_id
+ }
+
+ if keep_context_nodes:
+ context_node_ids = set()
+ for edge in candidate_edges:
+ source = edge["data"].get("source")
+ target = edge["data"].get("target")
+ if source in filtered_node_ids or target in filtered_node_ids:
+ if source:
+ context_node_ids.add(source)
+ if target:
+ context_node_ids.add(target)
+ filtered_node_ids |= context_node_ids
+
+ filtered_node_ids |= edge_node_ids
+
+ filtered_nodes = [
+ nodes_by_id[node_id]
+ for node_id in sorted(filtered_node_ids)
+ if node_id in nodes_by_id and node_passes(nodes_by_id[node_id])
+ ]
-def pages_from_nodes(nodes):
- return sorted(
- {
- n.get("data", {}).get("page")
- for n in nodes
- if n.get("data", {}).get("page") is not None
- }
+ filtered_edges = [
+ edge
+ for edge in candidate_edges
+ if edge["data"].get("source") in filtered_node_ids
+ and edge["data"].get("target") in filtered_node_ids
+ ]
+
+ if hide_isolated_nodes:
+ connected = set()
+ for edge in filtered_edges:
+ connected.add(edge["data"].get("source"))
+ connected.add(edge["data"].get("target"))
+ filtered_nodes = [node for node in filtered_nodes if node["data"]["id"] in connected]
+
+ return filtered_nodes, filtered_edges
+
+
+def _apply_highlight(
+ nodes: List[Dict[str, Any]],
+ edges: List[Dict[str, Any]],
+ highlight_ids: Iterable[str],
+) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
+ highlight_set = {hid for hid in (highlight_ids or [])}
+ if not highlight_set:
+ return nodes, edges
+
+ highlighted_nodes = []
+ for node in nodes:
+ node_id = node.get("data", {}).get("id")
+ classes = node.get("classes", "")
+ if node_id in highlight_set:
+ classes = f"{classes} highlight".strip()
+ else:
+ classes = f"{classes} dimmed".strip()
+ highlighted_nodes.append({**node, "classes": classes})
+
+ highlighted_edges = []
+ for edge in edges:
+ data = edge.get("data", {})
+ classes = edge.get("classes", "")
+ if data.get("source") in highlight_set or data.get("target") in highlight_set:
+ classes = f"{classes} highlight".strip()
+ else:
+ classes = f"{classes} dimmed".strip()
+ highlighted_edges.append({**edge, "classes": classes})
+
+ return highlighted_nodes, highlighted_edges
+
+
+def _group_connections(
+ edges: List[Dict[str, Any]],
+ node_id: str,
+) -> Dict[str, List[str]]:
+ groups: Dict[str, List[str]] = {}
+ for edge in edges:
+ data = edge.get("data", {})
+ edge_type = data.get("type")
+ source = data.get("source")
+ target = data.get("target")
+ if source == node_id:
+ key = f"Outgoing::{edge_type}"
+ groups.setdefault(key, []).append(target)
+ elif target == node_id:
+ key = f"Incoming::{edge_type}"
+ groups.setdefault(key, []).append(source)
+ return groups
+
+
+def _export_rows(graph: Dict[str, Any]) -> Tuple[List[List[Any]], List[List[Any]]]:
+ nodes = graph.get("nodes", []) if graph else []
+ edges = graph.get("edges", []) if graph else []
+ node_lookup = {node["data"]["id"]: node for node in nodes}
+
+ node_rows = [["Node Type", "Name", "Description", "Image", "Weight"]]
+ for node in nodes:
+ data = node.get("data", {})
+ node_rows.append(
+ [
+ data.get("type", ""),
+ data.get("label", ""),
+ data.get("description", ""),
+ data.get("image", ""),
+ data.get("weight", 0),
+ ]
+ )
+
+ edge_rows = [["From Type", "From Name", "Edge Type", "To Type", "To Name", "Weight"]]
+ for edge in edges:
+ data = edge.get("data", {})
+ source = node_lookup.get(data.get("source"), {}).get("data", {})
+ target = node_lookup.get(data.get("target"), {}).get("data", {})
+ edge_rows.append(
+ [
+ source.get("type", ""),
+ source.get("label", ""),
+ data.get("type", ""),
+ target.get("type", ""),
+ target.get("label", ""),
+ data.get("weight", 0),
+ ]
+ )
+
+ return node_rows, edge_rows
+
+
+def _csv_bytes(rows: List[List[Any]]) -> bytes:
+ import csv
+ import io
+
+ buffer = io.StringIO()
+ writer = csv.writer(buffer)
+ writer.writerows(rows)
+ return buffer.getvalue().encode("utf-8")
+
+
+def _xlsx_bytes(sheets: Dict[str, List[List[Any]]]) -> bytes:
+ import io
+ import zipfile
+
+ def column_name(index: int) -> str:
+ name = ""
+ while index:
+ index, rem = divmod(index - 1, 26)
+ name = chr(65 + rem) + name
+ return name
+
+ def xml_escape(value: str) -> str:
+ return (
+ value.replace("&", "&")
+ .replace("<", "<")
+ .replace(">", ">")
+ .replace("\"", """)
+ .replace("'", "'")
+ )
+
+ shared_strings: List[str] = []
+ shared_index: Dict[str, int] = {}
+
+ def shared_string(value: str) -> int:
+ if value not in shared_index:
+ shared_index[value] = len(shared_strings)
+ shared_strings.append(value)
+ return shared_index[value]
+
+ worksheets = {}
+ for sheet_index, (sheet_name, rows) in enumerate(sheets.items(), start=1):
+ row_xml = []
+ for row_idx, row in enumerate(rows, start=1):
+ cell_xml = []
+ for col_idx, value in enumerate(row, start=1):
+ cell_ref = f"{column_name(col_idx)}{row_idx}"
+ if isinstance(value, (int, float)) and not isinstance(value, bool):
+ cell_xml.append(f'<c r="{cell_ref}"><v>{value}</v></c>')
+ else:
+ idx = shared_string(xml_escape(str(value)))
+ cell_xml.append(f'<c r="{cell_ref}" t="s"><v>{idx}</v></c>')
+ row_xml.append(f'<row r="{row_idx}">{"".join(cell_xml)}</row>')
+ worksheet_xml = (
+ '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>'
+ '<worksheet xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main">'
+ f"<sheetData>{''.join(row_xml)}</sheetData>"
+ "</worksheet>"
+ )
+ worksheets[f"xl/worksheets/sheet{sheet_index}.xml"] = worksheet_xml
+
+ shared_xml = (
+ '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>'
+ '<sst xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main">'
+ + "".join(f"<si><t>{value}</t></si>" for value in shared_strings)
+ + "</sst>"
+ )
+
+ workbook_sheets = []
+ rels = []
+ for index, sheet_name in enumerate(sheets.keys(), start=1):
+ workbook_sheets.append(
+ f'<sheet name="{xml_escape(sheet_name)}" sheetId="{index}" r:id="rId{index}"/>'
+ )
+ rels.append(
+ f'<Relationship Id="rId{index}" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/worksheet" Target="worksheets/sheet{index}.xml"/>'
+ )
+
+ workbook_xml = (
+ '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>'
+ '<workbook xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">'
+ f"<sheets>{''.join(workbook_sheets)}</sheets>"
+ "</workbook>"
+ )
+
+ workbook_rels_xml = (
+ '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>'
+ '<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">'
+ + "".join(rels)
+ + "</Relationships>"
+ )
+
+ rels_xml = (
+ '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>'
+ '<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">'
+ '<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="xl/workbook.xml"/>'
+ '</Relationships>'
)
+ content_types = (
+ '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>'
+ '<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">'
+ '<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>'
+ '<Default Extension="xml" ContentType="application/xml"/>'
+ '<Override PartName="/xl/workbook.xml" ContentType="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet.main+xml"/>'
+ '<Override PartName="/xl/sharedStrings.xml" ContentType="application/vnd.openxmlformats-officedocument.spreadsheetml.sharedStrings+xml"/>'
+ + "".join(
+ f'<Override PartName="/xl/worksheets/sheet{index}.xml" ContentType="application/vnd.openxmlformats-officedocument.spreadsheetml.worksheet+xml"/>'
+ for index in range(1, len(sheets) + 1)
+ )
+ + "</Types>"
+ )
+
+ buffer = io.BytesIO()
+ with zipfile.ZipFile(buffer, "w", zipfile.ZIP_DEFLATED) as archive:
+ archive.writestr("[Content_Types].xml", content_types)
+ archive.writestr("_rels/.rels", rels_xml)
+ archive.writestr("xl/workbook.xml", workbook_xml)
+ archive.writestr("xl/_rels/workbook.xml.rels", workbook_rels_xml)
+ archive.writestr("xl/sharedStrings.xml", shared_xml)
+ for path, data in worksheets.items():
+ archive.writestr(path, data)
+
+ return buffer.getvalue()
+
# -------------------------------------------------
-# Layout + Styles (dark-mode, inverted demo)
+# Layout + Styles
# -------------------------------------------------
LAYOUTS = [
("Dagre (sequence)", "dagre"),
("Breadthfirst", "breadthfirst"),
+ ("ForceAtlas2", "forceatlas2"),
("Force-directed (COSE)", "cose"),
("COSE-Bilkent", "cose-bilkent"),
("Cola (read text)", "cola"),
@@ -50,7 +361,7 @@ def pages_from_nodes(nodes):
]
-def layout_for(name: str, scaling_ratio: int):
+def layout_for(name: str, scaling_ratio: int) -> Dict[str, Any]:
s = max(50, min(int(scaling_ratio or 250), 800))
name = name or "dagre"
@@ -76,6 +387,19 @@ def layout_for(name: str, scaling_ratio: int):
"padding": 30,
}
+ if name == "forceatlas2":
+ return {
+ "name": "forceatlas2",
+ "iterations": 800,
+ "scalingRatio": max(0.5, s / 100),
+ "gravity": 1.0,
+ "linLogMode": False,
+ "preventOverlap": True,
+ "fit": True,
+ "padding": 30,
+ "animate": False,
+ }
+
if name in ("cose", "cose-bilkent", "euler"):
return {
"name": name,
@@ -92,45 +416,6 @@ def layout_for(name: str, scaling_ratio: int):
return {"name": name, "fit": True, "padding": 30}
-def base_stylesheet(node_size, font_size, text_max_width, show_edge_labels):
- return [
- {
- "selector": "node",
- "style": {
- "label": "data(label)",
- "font-size": f"{font_size}px",
- "text-wrap": "wrap",
- "text-max-width": f"{text_max_width}px",
- "color": "#E5E7EB",
- "text-outline-width": 1,
- "text-outline-color": "#0B1220",
- "width": f"{node_size}px",
- "height": f"{node_size}px",
- "z-index": 9999,
- },
- },
- {
- "selector": "edge",
- "style": {
- "curve-style": "bezier",
- "line-color": "#64748B",
- "target-arrow-color": "#64748B",
- "target-arrow-shape": "triangle",
- "arrow-scale": 0.8,
- "opacity": 0.55,
- "label": "data(rel)" if show_edge_labels else "",
- "font-size": "9px",
- "color": "#CBD5E1",
- "z-index": 5000,
- },
- },
- {"selector": ".document", "style": {"background-color": "#1D4ED8"}},
- {"selector": ".section", "style": {"background-color": "#111827"}},
- {"selector": ".item", "style": {"background-color": "#334155"}},
- {"selector": ":selected", "style": {"border-width": 3, "border-color": "#FBBF24"}},
- ]
-
-
# -------------------------------------------------
# App + Defaults
# -------------------------------------------------
@@ -139,12 +424,14 @@ def base_stylesheet(node_size, font_size, text_max_width, show_edge_labels):
DEFAULT_VIEW = {
"layout": "dagre",
"scaling_ratio": 250,
- "node_size": 22,
- "font_size": 10,
- "text_max_width": 520,
"show_edge_labels": False,
+ "show_arrows": True,
+ "scale_node_size": True,
+ "scale_edge_width": True,
}
+DEFAULT_THEME = "dark"
+
app = Dash(
__name__,
title="Docling Graph Viewer",
@@ -154,26 +441,168 @@ def base_stylesheet(node_size, font_size, text_max_width, show_edge_labels):
# -------------------------------------------------
-# Layout (demo-style + stores)
+# Layout
# -------------------------------------------------
app.layout = html.Div(
+ className="docling-app",
children=[
dcc.Store(id="store_graph"),
- dcc.Store(id="store_node_index"),
-
+ dcc.Store(id="store_filtered_graph"),
+ dcc.Store(id="store_metadata"),
+ dcc.Store(id="store_selected_node"),
+ dcc.Store(id="store_inspector_expansion", data={}),
+ dcc.Store(id="store_highlight", data=[]),
+ dcc.Store(id="store_theme", data=DEFAULT_THEME),
+ dcc.Download(id="download-export"),
html.Div(
- className="row",
+ className="app-grid",
children=[
- # LEFT — Graph
html.Div(
- className="eight columns",
+ className="panel-left",
+ children=[
+ html.Div("Graph controls", className="panel-title"),
+ html.Div(
+ className="control-section",
+ children=[
+ html.Div("Document", className="control-label"),
+ dcc.Dropdown(
+ id="file",
+ options=[{"label": f, "value": f} for f in files],
+ value=(files[0] if files else None),
+ clearable=False,
+ ),
+ ],
+ ),
+ html.Div(
+ className="control-section",
+ children=[
+ html.Div("Search", className="control-label"),
+ dcc.Dropdown(
+ id="search_node",
+ options=[],
+ placeholder="Search by node name",
+ clearable=True,
+ ),
+ ],
+ ),
+ html.Div(
+ className="control-section",
+ children=[
+ html.Div("Node Types", className="control-label"),
+ dcc.Dropdown(
+ id="node_type_filter",
+ options=[],
+ multi=True,
+ placeholder="Filter node types",
+ ),
+ ],
+ ),
+ html.Div(
+ className="control-section",
+ children=[
+ html.Div("Edge Types", className="control-label"),
+ dcc.Dropdown(
+ id="edge_type_filter",
+ options=[],
+ multi=True,
+ placeholder="Filter edge types",
+ ),
+ ],
+ ),
+ html.Div(
+ className="control-section",
+ children=[
+ html.Div("Min node weight", className="control-label"),
+ dcc.Slider(
+ id="min_node_weight",
+ min=0,
+ max=25,
+ step=1,
+ value=0,
+ ),
+ ],
+ ),
+ html.Div(
+ className="control-section",
+ children=[
+ html.Div("Min edge weight", className="control-label"),
+ dcc.Slider(
+ id="min_edge_weight",
+ min=0,
+ max=10,
+ step=1,
+ value=0,
+ ),
+ ],
+ ),
+ html.Div(
+ className="control-section",
+ children=[
+ dcc.Checklist(
+ id="graph_toggles",
+ options=[
+ {"label": " Hide Page nodes", "value": "hide_pages"},
+ {"label": " Hide isolated nodes", "value": "hide_isolated"},
+ {"label": " Show edge labels", "value": "edge_labels"},
+ {"label": " Show arrows", "value": "arrows"},
+ {"label": " Keep context nodes", "value": "keep_context"},
+ {"label": " Scale node size", "value": "scale_node"},
+ {"label": " Scale edge width", "value": "scale_edge"},
+ ],
+ value=[
+ "arrows",
+ "scale_node",
+ "scale_edge",
+ ],
+ inputClassName="control-checkbox",
+ labelClassName="control-checkbox__label",
+ )
+ ],
+ ),
+ html.Div(
+ className="control-section",
+ children=[
+ html.Div("Layout", className="control-label"),
+ dcc.Dropdown(
+ id="layout",
+ options=[{"label": l, "value": v} for l, v in LAYOUTS],
+ value=DEFAULT_VIEW["layout"],
+ clearable=False,
+ ),
+ html.Div("Scaling ratio", className="control-label"),
+ dcc.Slider(
+ id="scaling_ratio",
+ min=50,
+ max=800,
+ step=10,
+ value=DEFAULT_VIEW["scaling_ratio"],
+ ),
+ ],
+ ),
+ html.Div(
+ className="control-section",
+ children=[
+ html.Button("Reset view", id="reset_view", className="btn-primary"),
+ ],
+ ),
+ html.Div(
+ className="control-section",
+ children=[
+ html.Button("Export CSV", id="export_csv", className="btn"),
+ html.Button("Export XLSX", id="export_xlsx", className="btn"),
+ ],
+ ),
+ ],
+ ),
+ html.Div(
+ className="panel-main",
children=[
cyto.Cytoscape(
id="graph",
style={
"width": "100%",
"height": "85vh",
- "backgroundColor": "#0B0F17",
+ "backgroundColor": "var(--gc-graph-bg)",
},
wheelSensitivity=0.01,
minZoom=0.25,
@@ -182,161 +611,36 @@ def base_stylesheet(node_size, font_size, text_max_width, show_edge_labels):
DEFAULT_VIEW["layout"],
DEFAULT_VIEW["scaling_ratio"],
),
- stylesheet=base_stylesheet(
- DEFAULT_VIEW["node_size"],
- DEFAULT_VIEW["font_size"],
- DEFAULT_VIEW["text_max_width"],
+ stylesheet=safe_base_stylesheet(
+ DEFAULT_THEME,
+ DEFAULT_VIEW["scale_node_size"],
+ DEFAULT_VIEW["scale_edge_width"],
DEFAULT_VIEW["show_edge_labels"],
+ DEFAULT_VIEW["show_arrows"],
),
elements=[],
- )
+ ),
],
),
-
- # RIGHT — Control Panel
html.Div(
- className="four columns",
+ className="panel-right",
children=[
- dcc.Tabs(
- className="control-tabs",
- colors={
- "border": "#111827",
- "primary": "#7c3aed",
- "background": "#0b0f17",
- },
- children=[
- dcc.Tab(
- className="control-tab",
- selected_className="control-tab--selected",
- label="Control Panel",
- children=[
- html.Div(
- className="control-panel",
- children=[
- html.Div(
- className="control-panel__header",
- children=[
- html.Div("Graph controls", className="control-title"),
- html.Span("Dark mode", className="pill pill--invert"),
- ],
- ),
- html.Div(
- className="control-section",
- children=[
- html.Div("Document", className="control-label"),
- dcc.Dropdown(
- id="file",
- options=[{"label": f, "value": f} for f in files],
- value=(files[0] if files else None),
- clearable=False,
- ),
- ],
- ),
- html.Div(
- className="control-section",
- children=[
- html.Div("Page range", className="control-label"),
- dcc.RangeSlider(
- id="page_range",
- min=1,
- max=1,
- step=1,
- value=[1, 1],
- marks={},
- allowCross=False,
- ),
- html.Div(
- id="page_range_value",
- className="control-subtext",
- ),
- ],
- ),
- html.Div(
- className="control-section",
- children=[
- html.Div("Layout", className="control-label"),
- dcc.Dropdown(
- id="layout",
- options=[{"label": l, "value": v} for l, v in LAYOUTS],
- value=DEFAULT_VIEW["layout"],
- clearable=False,
- ),
- ],
- ),
- html.Div(
- className="control-section",
- children=[
- html.Div("Scaling ratio", className="control-label"),
- dcc.Slider(
- id="scaling_ratio",
- min=50,
- max=800,
- step=10,
- value=DEFAULT_VIEW["scaling_ratio"],
- ),
- ],
- ),
- html.Div(
- className="control-section",
- children=[
- html.Div("Expand", className="control-label"),
- dcc.RadioItems(
- id="expand_mode",
- options=[
- {"label": "Children (hier)", "value": "children"},
- {"label": "All outgoing", "value": "out"},
- {"label": "All incoming", "value": "in"},
- ],
- value="children",
- inputClassName="control-radio",
- labelClassName="control-radio__label",
- ),
- ],
- ),
- html.Div(
- className="control-section control-section--tight",
- children=[
- dcc.Checklist(
- id="edge_labels",
- options=[{"label": " Show edge labels", "value": "on"}],
- value=[],
- inputClassName="control-checkbox",
- labelClassName="control-checkbox__label",
- ),
- ],
- ),
- ],
- )
- ],
- ),
- dcc.Tab(
- className="control-tab",
- selected_className="control-tab--selected",
- label="JSON",
- children=[
- html.Div(
- className="control-panel control-panel--secondary",
- children=[
- html.Div(
- className="control-panel__header",
- children=[
- html.Div("Click to inspect", className="control-title"),
- html.Span("debug", className="pill"),
- ],
- ),
- html.Pre(id="tap-node-json-output", style={"height": "35vh", "overflowY": "auto"}),
- html.Pre(id="tap-edge-json-output", style={"height": "35vh", "overflowY": "auto"}),
- ],
- )
- ],
- ),
- ]
- )
+ html.Div("Inspector", className="panel-title"),
+ html.Div(id="inspector_panel", className="inspector-panel"),
+ html.Button("Reset highlights", id="reset_highlight", className="btn"),
],
),
],
),
- ]
+ html.Div(
+ className="debug-panel",
+ children=[
+ html.Div("Debug JSON", className="panel-title"),
+ html.Pre(id="tap-node-json-output", className="debug-output"),
+ html.Pre(id="tap-edge-json-output", className="debug-output"),
+ ],
+ ),
+ ],
)
@@ -346,179 +650,283 @@ def base_stylesheet(node_size, font_size, text_max_width, show_edge_labels):
@app.callback(
Output("graph", "elements"),
Output("store_graph", "data"),
- Output("store_node_index", "data"),
+ Output("store_metadata", "data"),
Input("file", "value"),
)
def load_graph(path):
if not path:
return [], None, None
- g = build_graph_from_docling_json(path)
-
- node_index = {n["data"]["id"]: n for n in g.nodes if n.get("data", {}).get("id")}
-
- # Genesis node: document
- doc_node = next((n for n in g.nodes if n["data"].get("type") == "document"), None)
- elements = [doc_node] if doc_node else []
+ graph = build_graph_from_docling_json(path)
+ node_types = sorted({n["data"].get("type", "") for n in graph.nodes})
+ edge_types = sorted({e["data"].get("type", "") for e in graph.edges})
+ search_options = [
+ {"label": n["data"].get("label"), "value": n["data"].get("id")}
+ for n in graph.nodes
+ ]
+ metadata = {
+ "node_types": node_types,
+ "edge_types": edge_types,
+ "search_options": search_options,
+ }
+ store_graph = {"nodes": graph.nodes, "edges": graph.edges}
+ return [], store_graph, metadata
- store_graph = {"nodes": g.nodes, "edges": g.edges}
- return elements, store_graph, node_index
+@app.callback(
+ Output("node_type_filter", "options"),
+ Output("edge_type_filter", "options"),
+ Output("search_node", "options"),
+ Input("store_metadata", "data"),
+)
+def update_filters(metadata):
+ if not metadata:
+ return [], [], []
+ node_options = [{"label": t, "value": t} for t in metadata.get("node_types", [])]
+ edge_options = [{"label": t, "value": t} for t in metadata.get("edge_types", [])]
+ return node_options, edge_options, metadata.get("search_options", [])
@app.callback(
- Output("page_range", "min"),
- Output("page_range", "max"),
- Output("page_range", "value"),
- Output("page_range", "marks"),
- Input("file", "value"),
+ Output("graph", "elements", allow_duplicate=True),
+ Output("store_filtered_graph", "data"),
+ Input("store_graph", "data"),
+ Input("node_type_filter", "value"),
+ Input("edge_type_filter", "value"),
+ Input("graph_toggles", "value"),
+ Input("min_node_weight", "value"),
+ Input("min_edge_weight", "value"),
+ Input("store_theme", "data"),
+ Input("store_highlight", "data"),
+ prevent_initial_call="initial_duplicate",
)
-def init_page_range(path):
- if not path:
- return 1, 1, [1, 1], {}
+def apply_filters(
+ graph,
+ node_types,
+ edge_types,
+ toggles,
+ min_node_weight,
+ min_edge_weight,
+ theme,
+ highlight_ids,
+):
+ if not graph:
+ return [], None
+
+ toggles = toggles or []
+ hide_page_nodes = "hide_pages" in toggles
+ hide_isolated = "hide_isolated" in toggles
+ keep_context = "keep_context" in toggles
+
+ nodes, edges = _filter_graph(
+ graph,
+ node_types,
+ edge_types,
+ hide_page_nodes,
+ hide_isolated,
+ min_node_weight or 0,
+ min_edge_weight or 0,
+ keep_context,
+ )
- g = build_graph_from_docling_json(path)
- pages = pages_from_nodes(g.nodes)
- if not pages:
- return 1, 1, [1, 1], {}
+ themed_nodes, themed_edges = apply_theme_to_elements(nodes, edges, theme or DEFAULT_THEME)
+ themed_nodes, themed_edges = _apply_highlight(themed_nodes, themed_edges, highlight_ids)
- pmin, pmax = pages[0], pages[-1]
- return pmin, pmax, [pmin, min(pmax, pmin + 3)], {p: str(p) for p in pages if p == pmin or p == pmax or p % 5 == 0}
+ filtered_graph = {"nodes": themed_nodes, "edges": themed_edges}
+ return themed_nodes + themed_edges, filtered_graph
-@app.callback(Output("page_range_value", "children"), Input("page_range", "value"))
-def show_page_range(value):
- if not value:
- return ""
+@app.callback(
+ Output("graph", "layout"),
+ Input("layout", "value"),
+ Input("scaling_ratio", "value"),
+)
+def update_layout(name, scaling):
+ return layout_for(name, scaling)
- start, end = value
- if start == end:
- return f"Showing page {start}"
- return f"Showing pages {start} to {end}"
+@app.callback(
+ Output("graph", "stylesheet"),
+ Input("graph_toggles", "value"),
+ Input("store_theme", "data"),
+)
+def update_styles(toggles, theme):
+ toggles = toggles or []
+ return safe_base_stylesheet(
+ theme or DEFAULT_THEME,
+ "scale_node" in toggles,
+ "scale_edge" in toggles,
+ "edge_labels" in toggles,
+ "arrows" in toggles,
+ )
@app.callback(
- Output("graph", "elements", allow_duplicate=True),
+ Output("store_selected_node", "data"),
Input("graph", "tapNodeData"),
- State("graph", "elements"),
- State("store_graph", "data"),
- State("store_node_index", "data"),
- State("expand_mode", "value"),
- State("page_range", "value"),
+ Input("search_node", "value"),
+ State("store_filtered_graph", "data"),
prevent_initial_call=True,
)
-def expand_on_click(node_data, elements, store_graph, node_index, mode, page_range):
- if not node_data or not store_graph or not page_range:
- return no_update
-
- node_id = node_data.get("id")
- if not node_id:
- return no_update
-
- start_page, end_page = page_range
-
- def in_range(page):
- return page is None or (start_page <= page <= end_page)
-
- existing_nodes = {e["data"]["id"] for e in elements if "id" in e.get("data", {})}
- existing_edges = {e["data"]["id"] for e in elements if "source" in e.get("data", {})}
-
- for e in elements:
- if e.get("data", {}).get("id") == node_id:
- e["data"]["expanded"] = True
+def select_node(tap_node, search_value, filtered_graph):
+ trigger = ctx.triggered_id
+ if trigger == "search_node" and search_value:
+ return search_value
+ if tap_node and tap_node.get("id"):
+ return tap_node["id"]
+ return no_update
- new_nodes = []
- new_edges = []
- for ed in store_graph["edges"]:
- d = ed["data"]
- src, tgt, rel = d.get("source"), d.get("target"), d.get("rel")
-
- if mode == "children" and not (rel == "hier" and src == node_id):
- continue
- if mode == "out" and src != node_id:
- continue
- if mode == "in" and tgt != node_id:
- continue
+@app.callback(
+ Output("inspector_panel", "children"),
+ Input("store_filtered_graph", "data"),
+ Input("store_selected_node", "data"),
+ Input("store_inspector_expansion", "data"),
+)
+def render_inspector(filtered_graph, selected_node_id, expansion_state):
+ if not filtered_graph or not selected_node_id:
+ return html.Div("Select a node to inspect.", className="muted")
+
+ nodes = filtered_graph.get("nodes", [])
+ edges = filtered_graph.get("edges", [])
+ node_lookup = {node["data"]["id"]: node for node in nodes}
+ node = node_lookup.get(selected_node_id)
+ if not node:
+ return html.Div("Select a node to inspect.", className="muted")
+
+ data = node.get("data", {})
+ groups = _group_connections(edges, selected_node_id)
+ expansion_state = expansion_state or {}
+
+ group_blocks = []
+ for group_key in sorted(groups.keys()):
+ direction, edge_type = group_key.split("::", 1)
+ node_ids = groups[group_key]
+ total = len(node_ids)
+ shown = expansion_state.get(group_key, 10)
+ display_ids = node_ids[:shown]
+
+ connections = []
+ for nid in display_ids:
+ target_node = node_lookup.get(nid)
+ label = target_node.get("data", {}).get("label", nid) if target_node else nid
+ connections.append(html.Div(label, className="inspector-item"))
+
+ footer = None
+ if shown < total:
+ footer = html.Button(
+ f"Show more (+25)",
+ id={"type": "expand-group", "group": group_key},
+ className="btn-small",
+ )
+
+ group_blocks.append(
+ html.Div(
+ className="inspector-group",
+ children=[
+ html.Div(
+ f"{direction} → {edge_type} ({total})",
+ className="inspector-group-title",
+ ),
+ html.Div(connections, className="inspector-list"),
+ footer,
+ ],
+ )
+ )
+
+ if not group_blocks:
+ group_blocks.append(html.Div("No connections in current filter.", className="muted"))
+
+ return html.Div(
+ children=[
+ html.Div(data.get("label", ""), className="inspector-title"),
+ html.Div(data.get("type", ""), className="inspector-subtitle"),
+ html.Div(data.get("description", ""), className="inspector-description"),
+ html.Div(group_blocks),
+ ]
+ )
- if d["id"] in existing_edges:
- continue
- for nid in (src, tgt):
- if nid not in existing_nodes and nid in node_index:
- candidate = node_index[nid]
- if in_range(candidate.get("data", {}).get("page")):
- new_nodes.append(candidate)
+@app.callback(
+ Output("store_inspector_expansion", "data"),
+ Output("store_highlight", "data"),
+ Input({"type": "expand-group", "group": ALL}, "n_clicks"),
+ Input("reset_highlight", "n_clicks"),
+ State("store_inspector_expansion", "data"),
+ State("store_filtered_graph", "data"),
+ State("store_selected_node", "data"),
+ prevent_initial_call=True,
+)
+def update_expansion(
+ _clicks,
+ reset_clicks,
+ expansion_state,
+ filtered_graph,
+ selected_node_id,
+):
+ if ctx.triggered_id == "reset_highlight":
+ return {}, []
- if not in_range(node_index.get(src, {}).get("data", {}).get("page")):
- continue
- if not in_range(node_index.get(tgt, {}).get("data", {}).get("page")):
- continue
+ triggered = ctx.triggered_id
+ if not isinstance(triggered, dict):
+ return no_update, no_update
- new_edges.append(ed)
+ group_key = triggered.get("group")
+ expansion_state = expansion_state or {}
+ current = expansion_state.get(group_key, 10)
+ expansion_state[group_key] = current + 25
- if not new_nodes and not new_edges:
- return no_update
+ highlight_ids: List[str] = []
+ if filtered_graph and selected_node_id:
+ groups = _group_connections(filtered_graph.get("edges", []), selected_node_id)
+ highlight_ids = [selected_node_id] + groups.get(group_key, [])
- return elements + new_nodes + new_edges
+ return expansion_state, highlight_ids
@app.callback(
- Output("graph", "layout"),
- Input("layout", "value"),
- Input("scaling_ratio", "value"),
+ Output("graph", "elements", allow_duplicate=True),
+ Output("graph", "elements", allow_duplicate=True),
+ Output("store_selected_node", "data", allow_duplicate=True),
+ Output("store_highlight", "data", allow_duplicate=True),
+ Input("reset_view", "n_clicks"),
+ State("store_filtered_graph", "data"),
+ State("store_theme", "data"),
+ prevent_initial_call=True,
)
-def update_layout(name, scaling):
- return layout_for(name, scaling)
+def reset_view(_n_clicks, filtered_graph, theme):
+ if not filtered_graph:
+ return no_update, no_update, no_update
+ nodes = filtered_graph.get("nodes", [])
+ edges = filtered_graph.get("edges", [])
+ themed_nodes, themed_edges = apply_theme_to_elements(nodes, edges, theme or DEFAULT_THEME)
+ return themed_nodes + themed_edges, None, []
@app.callback(
- Output("graph", "elements", allow_duplicate=True),
- Input("page_range", "value"),
- State("graph", "elements"),
+ Output("download-export", "data"),
+ Input("export_csv", "n_clicks"),
+ Input("export_xlsx", "n_clicks"),
+ State("store_filtered_graph", "data"),
prevent_initial_call=True,
)
-def filter_elements_by_page(page_range, elements):
- if not page_range or not elements:
+def export_graph(csv_clicks, xlsx_clicks, filtered_graph):
+ if not filtered_graph:
return no_update
- start_page, end_page = page_range
- allowed_nodes = set()
- filtered_nodes = []
-
- for el in elements:
- data = el.get("data", {})
- if "source" in data:
- continue
+ nodes_rows, edge_rows = _export_rows(filtered_graph)
+ if ctx.triggered_id == "export_csv":
+ import io
+ import zipfile
- page = data.get("page")
- if page is None or (start_page <= page <= end_page):
- allowed_nodes.add(data.get("id"))
- filtered_nodes.append(el)
-
- filtered_edges = [
- el
- for el in elements
- if "source" in el.get("data", {})
- and el["data"].get("source") in allowed_nodes
- and el["data"].get("target") in allowed_nodes
- ]
+ buffer = io.BytesIO()
+ with zipfile.ZipFile(buffer, "w", zipfile.ZIP_DEFLATED) as archive:
+ archive.writestr("graph_nodes.csv", _csv_bytes(nodes_rows))
+ archive.writestr("graph_edges.csv", _csv_bytes(edge_rows))
+ return dcc.send_bytes(buffer.getvalue(), "graph_export.zip")
- return filtered_nodes + filtered_edges
-
-
-@app.callback(
- Output("graph", "stylesheet"),
- Input("edge_labels", "value"),
-)
-def update_styles(edge_labels):
- return base_stylesheet(
- DEFAULT_VIEW["node_size"],
- DEFAULT_VIEW["font_size"],
- DEFAULT_VIEW["text_max_width"],
- "on" in (edge_labels or []),
- )
+ xlsx_bytes = _xlsx_bytes({"Nodes": nodes_rows, "Edges": edge_rows})
+ return dcc.send_bytes(xlsx_bytes, "graph_export.xlsx")
@app.callback(
@@ -537,6 +945,19 @@ def show_edge(data):
return json.dumps(data, indent=2)
+# -------------------------------------------------
+# Tests
+# -------------------------------------------------
+class LayoutTests(unittest.TestCase):
+ def test_forceatlas2_is_in_layouts(self):
+ layout_values = {value for _, value in LAYOUTS}
+ self.assertIn("forceatlas2", layout_values)
+
+ def test_layout_for_forceatlas2(self):
+ layout = layout_for("forceatlas2", 250)
+ self.assertEqual(layout.get("name"), "forceatlas2")
+
+
# -------------------------------------------------
# Run
# -------------------------------------------------
diff --git a/tests/test_docling_json_smoke_adm_2024.py b/tests/test_docling_json_smoke_adm_2024.py
new file mode 100644
index 0000000..3156f9b
--- /dev/null
+++ b/tests/test_docling_json_smoke_adm_2024.py
@@ -0,0 +1,135 @@
+import json
+import sys
+import warnings
+from pathlib import Path
+
+import pytest
+
+ROOT = Path(__file__).resolve().parents[1]
+sys.path.insert(0, str(ROOT))
+
+from apps.docling_graph.graph_builder import build_elements_from_docling_json
+
+
+ADM_PATH = Path(
+ "docling-ws/data/docling/building_standards/ADM__V2_Amendment_Booklet_2024.json"
+)
+
+
+def _normalize_doc_root(doc):
+    """Return the inner document mapping when the JSON wraps it.
+
+    Some exports nest the payload under a "document" or "doc" key; in that
+    case the nested mapping is returned, otherwise ``doc`` is returned as-is
+    (including non-dict inputs, which pass through unchanged).
+    """
+    if isinstance(doc, dict):
+        for key in ("document", "doc"):
+            candidate = doc.get(key)
+            if isinstance(candidate, dict):
+                return candidate
+    return doc
+
+
+def _node_id(item, path):
+    """Build a stable identifier string for a docling item.
+
+    Prefers an explicit identifier field ("self_ref", "id", or "uid");
+    otherwise synthesizes "label::path::page" from the item label/type,
+    the traversal path, and the first provenance page number.
+    """
+    for key in ("self_ref", "id", "uid"):
+        value = item.get(key)
+        if value:
+            return str(value)
+    label = str(item.get("label") or item.get("type") or "node")
+    page_no = None
+    prov = item.get("prov") or item.get("provenance") or item.get("provenances")
+    if isinstance(prov, list) and prov:
+        # FIXME(review): assumes prov[0] is a dict. _has_provenance guards
+        # each entry with isinstance(p, dict); a non-dict first entry here
+        # would raise AttributeError — confirm against real docling output.
+        page_no = prov[0].get("page_no")
+    return f"{label}::{'/'.join(str(p) for p in path)}::{page_no or 'na'}"
+
+
+def _collect_nodes_and_edges(doc_root):
+    """Walk the docling JSON tree, collecting dict items and hierarchy edges.
+
+    Returns ``(nodes, edges)`` where ``nodes`` maps node ids (see _node_id)
+    to the raw dict items and ``edges`` is a list of (parent_id, child_id)
+    pairs derived only from explicit "children" lists.
+    """
+    nodes = {}
+    edges = []
+    # Visited containers tracked by object identity (id()) so shared or
+    # cyclic references are traversed at most once.
+    seen = set()
+    stack = [(doc_root, ("root",))]
+
+    while stack:
+        item, path = stack.pop()
+        if isinstance(item, dict):
+            item_id = id(item)
+            if item_id in seen:
+                continue
+            seen.add(item_id)
+
+            node_id = _node_id(item, path)
+            nodes[node_id] = item
+
+            # Only "children" lists produce parent -> child edges.
+            # NOTE(review): for a dict child with no explicit id that is
+            # shared by two parents, the path-based id recorded in the edge
+            # may differ from the id the node was first stored under; the
+            # dangling-edge assertion in the smoke test would surface this.
+            children = item.get("children")
+            if isinstance(children, list):
+                for idx, child in enumerate(children):
+                    if isinstance(child, dict):
+                        child_path = path + ("children", idx)
+                        child_id = _node_id(child, child_path)
+                        edges.append((node_id, child_id))
+                        stack.append((child, child_path))
+
+            # Other nested containers are traversed for node discovery but
+            # deliberately create no edges.
+            for key, value in item.items():
+                if isinstance(value, (dict, list)) and key != "children":
+                    stack.append((value, path + (key,)))
+
+        elif isinstance(item, list):
+            item_id = id(item)
+            if item_id in seen:
+                continue
+            seen.add(item_id)
+            for idx, child in enumerate(item):
+                stack.append((child, path + (idx,)))
+
+    return nodes, edges
+
+
+def _has_provenance(item):
+    """Return True if the item carries usable provenance.
+
+    Provenance may live under "prov", "provenance", or "provenances" and be
+    a dict or a list of dicts; an entry counts when it has a truthy
+    "page_no" or "bbox".
+    """
+    prov = item.get("prov") or item.get("provenance") or item.get("provenances")
+    if isinstance(prov, list):
+        return any(isinstance(p, dict) and (p.get("page_no") or p.get("bbox")) for p in prov)
+    if isinstance(prov, dict):
+        return bool(prov.get("page_no") or prov.get("bbox"))
+    return False
+
+
+@pytest.mark.skipif(not ADM_PATH.exists(), reason="ADM_2024 fixture not available")
+def test_docling_json_smoke_adm_2024():
+    """Smoke-test the raw ADM 2024 docling JSON: nodes exist, edges exist,
+    and no edge references a node that was not collected."""
+    with ADM_PATH.open("r", encoding="utf-8") as handle:
+        doc = json.load(handle)
+
+    doc_root = _normalize_doc_root(doc)
+    nodes, edges = _collect_nodes_and_edges(doc_root)
+
+    assert nodes, "No nodes collected from ADM doc"
+    assert edges, "No hierarchy edges collected from ADM doc"
+
+    # Every edge endpoint must be a collected node id.
+    node_ids = set(nodes.keys())
+    dangling = [edge for edge in edges if edge[0] not in node_ids or edge[1] not in node_ids]
+    assert not dangling, f"Found dangling edges: {dangling[:5]}"
+
+    # Missing provenance is tolerated (no failure) but surfaced as a warning.
+    provenance_count = sum(1 for item in nodes.values() if _has_provenance(item))
+    if provenance_count == 0:
+        warnings.warn("No provenance found in ADM doc nodes; allowed but should be verified", RuntimeWarning)
+
+
+@pytest.mark.skipif(not ADM_PATH.exists(), reason="ADM_2024 fixture not available")
+def test_graph_builder_elements_adm_2024():
+    """Validate build_elements_from_docling_json output for the ADM fixture:
+    a list of elements with unique node ids and non-empty edge endpoints."""
+    with ADM_PATH.open("r", encoding="utf-8") as handle:
+        doc = json.load(handle)
+
+    elements = build_elements_from_docling_json(doc, doc_name=ADM_PATH.name)
+
+    assert isinstance(elements, list)
+    node_ids = set()
+    edge_count = 0
+
+    for element in elements:
+        data = element.get("data", {})
+        # Elements whose data carries "source" are treated as edges
+        # (cytoscape-style element dicts); everything else is a node.
+        if "source" in data:
+            edge_count += 1
+            assert data.get("source")
+            assert data.get("target")
+        else:
+            node_id = data.get("id")
+            assert node_id
+            assert node_id not in node_ids  # node ids must be unique
+            node_ids.add(node_id)
+
+    assert node_ids
+    assert edge_count > 0